# Interactive Notebook

## Setup

In [2]:
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import pprint as pp
import datetime
import os

In [3]:
UDACITY_URL = "https://www.udacity.com/courses/all"
chrome_driver = "/usr/bin/chromedriver"

## Get Data from Website

In [4]:
browser = webdriver.Chrome(executable_path=chrome_driver)
browser.get(UDACITY_URL)

pop_up_xml_path = "/html/body/ir-root/ir-content/ir-autopopup-modal/ir-modal/div/div[2]/div/div[1]"

delay = 30  # seconds
try:
    popup_close_button = WebDriverWait(browser, delay).until(
        EC.presence_of_element_located(
            (
                By.XPATH, pop_up_xml_path
            )
        )
    )
    print("Course Catalog Page is ready!")
except TimeoutException:
    print("Loading Course Catalog Page took too much time!")

print("Closing pop up button")
popup_close_button.click()

Course Catalog Page is ready!
Closing pop up button


In [5]:
soup_level1 = BeautifulSoup(browser.page_source, "html.parser")

In [6]:
all_links = soup_level1.find_all('a')

In [7]:
len(all_links)

949

## Inspect and Clean Data

In [8]:
for link_tag in all_links[:10]:
    print(type(link_tag))

<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>


In [9]:
dir(link_tag)

['__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_find_all',
 '_find_one',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_should_pretty_print',
 'append',
 'attrs',
 'can_be_empty_element',
 'cdata_list_attributes',
 'childGenerator',
 'children',
 'clear',
 'contents',
 'decode',
 'decode_contents',
 'decompose',
 'descendants',
 'encode',
 'encode_contents',
 'extend',
 'extract',
 'fetchNextSiblings',
 'fetchParents',
 'fetchPrevious',
 'fetchPreviousSiblings',
 'find',
 'findAl

### Inspect Tag Attributes

In [10]:
# attributes
pp.pprint(link_tag)

<a _ngcontent-iridium-us-c20="" data-analytics-attrs='{
          "category": "Navbar Link",
          "label": "Navbar Link - Header Link Clicked - Catalog - school-of-programming - ios-developer-nanodegree--nd003",
          "cta_message": "ios-developer-nanodegree--nd003",
          "cta_type": "link",
          "cta_location": "/",
          "cta_destination": "course/ios-developer-nanodegree--nd003"
        }' data-analytics-event="Navbar Link Clicked" href="/course/ios-developer-nanodegree--nd003"><div _ngcontent-iridium-us-c20="" class="secondary-menu-item">iOS Developer</div></a>


Let's figure out the number of unique tag attributes combinations and their frequency:

In [39]:
# look at unique attribute keys
unique_keys_count_dict = dict()
for i, link_tag in enumerate(all_links):
    #print(f"{i:<4}: {link_tag.attrs.keys()}")
    attribute_keys = tuple(sorted(list(link_tag.attrs.keys())))
    
    unique_keys_count_dict[attribute_keys] = unique_keys_count_dict.get(attribute_keys, 0) + 1

print(f'\n{len(unique_keys_count_dict)} Unique Attributes Frequency')
for key, value in unique_keys_count_dict.items():
    print(f"{value:<4}: {key}")


25 Unique Attributes Frequency
1   : ('_ngcontent-iridium-us-c1', 'class', 'href', 'irhybridlink', 'title')
100 : ('_ngcontent-iridium-us-c20', 'data-analytics-attrs', 'data-analytics-event', 'href')
1   : ('_ngcontent-iridium-us-c8', 'data-analytics-attrs', 'data-analytics-event', 'href')
1   : ('_ngcontent-iridium-us-c8', 'class', 'href')
10  : ('_ngcontent-iridium-us-c30', 'data-analytics-attrs', 'data-analytics-event', 'href')
3   : ('_ngcontent-iridium-us-c1', 'class', 'data-analytics-attrs', 'data-analytics-event', 'href', 'title')
1   : ('_ngcontent-iridium-us-c4', 'class', 'href')
1   : ('_ngcontent-iridium-us-c4', 'class', 'href', 'title')
1   : ('_ngcontent-iridium-us-c4', 'href', 'id', 'irhybridlink', 'title')
7   : ('_ngcontent-iridium-us-c11', 'data-analytics-attrs', 'data-analytics-event', 'href')
1   : ('_ngcontent-iridium-us-c11', 'class', 'data-analytics-attrs', 'data-analytics-event', 'href')
1   : ('_ngcontent-iridium-us-c4', 'class', 'data-analytics-attrs', 'data-a

Let's figure out which ones we don't need:

In [57]:
# Helper Function

def get_tags_with_keyword_attribute(all_links, attribute):
    if attribute == '':
        print("ERROR: No attribute passed")
        return None

    for i, link_tag in enumerate(all_links):
        if attribute in link_tag.attrs.keys():
            print(f"{i}: {link_tag}")
            print('')

### Inspect link tags with 'target' attribute

In [58]:
# Investigate link tags with the 'target' attribute   
get_tags_with_keyword_attribute(all_links, 'target')

190: <a href="/legal/terms-of-use" target="_blank">Terms of
    Use</a>

191: <a href="/legal/privacy" target="_blank">Privacy Policy</a>

943: <a _ngcontent-iridium-us-c18="" class="ng-star-inserted" href="https://www.facebook.com/Udacity" id="ga-facebook" rel="nofollow noopener noreferrer" target="_blank" title="Facebook"><!-- --><!-- --><!-- --><img _ngcontent-iridium-us-c18="" class="social__icon ng-star-inserted" deferimgsrc="/assets/iridium/images/core/footer/footer-social/facebook.svg" irdeferimage="" src="/assets/iridium/images/core/footer/footer-social/facebook.svg" srcset="/assets/iridium/images/core/footer/footer-social/facebook.svg 1x"/><!-- --><!-- --></a>

944: <a _ngcontent-iridium-us-c18="" class="ng-star-inserted" href="https://twitter.com/udacity" id="ga-twitter" rel="nofollow noopener noreferrer" target="_blank" title="Twitter"><!-- --><!-- --><!-- --><img _ngcontent-iridium-us-c18="" class="social__icon ng-star-inserted" deferimgsrc="/assets/iridium/images/core/foot

These link tags are links to Udacity's Terms of Use, Privacy Policy and various Social Media links. We don't need them.

### Inspect link tags with 'style' attribute

In [59]:
get_tags_with_keyword_attribute(all_links, 'style')

128: <a _ngcontent-iridium-us-c4="" class="button button--navigation ng-star-inserted" data-analytics-attrs='{
                "category": "Navbar Link",
                "label": "Navbar Link - Header Link Clicked - Sign In",
                "cta_type": "link",
                "cta_destination": "self",
                "cta_location": "Homepage"
              }' data-analytics-event="Navbar Link Clicked" href="https://auth.udacity.com/sign-in?next=https://classroom.udacity.com/authenticated" style="display:inline" title="Sign In"> Sign In </a>

129: <a _ngcontent-iridium-us-c4="" class="ng-star-inserted button button--navigation button--primary" data-analytics-attrs='{
                "category": "Navbar Link",
                "label": "Navbar Link - Header Link Clicked - Get Started",
                "cta_type": "link",
                "cta_destination": "self",
                "cta_location": "Homepage"
              }' data-analytics-event="Navbar Link Clicked" href="https://auth.ud

These link tags are links to Udacity's Sign In/Sign Up pages. We don't need them.

### Inspect link tags with 'rel' attribute

In [60]:
get_tags_with_keyword_attribute(all_links, 'rel')

937: <a _ngcontent-iridium-us-c5="" href="/legal" id="ga-b93331" rel="nofollow">Legal &amp; Privacy</a>

943: <a _ngcontent-iridium-us-c18="" class="ng-star-inserted" href="https://www.facebook.com/Udacity" id="ga-facebook" rel="nofollow noopener noreferrer" target="_blank" title="Facebook"><!-- --><!-- --><!-- --><img _ngcontent-iridium-us-c18="" class="social__icon ng-star-inserted" deferimgsrc="/assets/iridium/images/core/footer/footer-social/facebook.svg" irdeferimage="" src="/assets/iridium/images/core/footer/footer-social/facebook.svg" srcset="/assets/iridium/images/core/footer/footer-social/facebook.svg 1x"/><!-- --><!-- --></a>

944: <a _ngcontent-iridium-us-c18="" class="ng-star-inserted" href="https://twitter.com/udacity" id="ga-twitter" rel="nofollow noopener noreferrer" target="_blank" title="Twitter"><!-- --><!-- --><!-- --><img _ngcontent-iridium-us-c18="" class="social__icon ng-star-inserted" deferimgsrc="/assets/iridium/images/core/footer/footer-social/twitter.svg" irde

These link tags overlap with the 'target' attribute tags. There is one additional tag captured which is the Udacity Legal and Privacy link. We don't need any of these links.

### Inspect link tags with 'id' attribute

In [61]:
get_tags_with_keyword_attribute(all_links, 'id')

63: <a _ngcontent-iridium-us-c4="" href="/" id="ga-294bb9" irhybridlink="/" title="Udacity"><div _ngcontent-iridium-us-c4="" class="logo-mark"><svg _ngcontent-iridium-us-c4="" alt="Udacity" height="30" viewbox="0 0 30 30" width="30"><path _ngcontent-iridium-us-c4="" d="M29 .5l1 .5v13c0 5.551887-2.8897 8.695692-5.995216 10.099885L24.0094 24.1l-7.620963 4.388562c-.05877.03456-.11776.068527-.176957.101902l-.08642.049764.001806-.00254C14.48289 29.545895 12.684682 30 11 30 6 30 0 26 0 18V6l2 1v11c0 8 6 10 9 10 1.858706 0 4.86901-.76773 6.89043-3.254517C14.181726 23.872636 10 20.678703 10 14V2.2L2 7 0 6l10-6 1 .5 1 .5v13c0 6.939958 4.515283 8.612086 7.017542 8.933368C19.625786 21.62159 20 19.997902 20 18V5l2 1v12c0 1.852645-.321777 3.49077-.878395 4.91438C23.648223 22.549217 28 20.813144 28 14V2.25L22 6l-2-1 8-5 1 .5z" fill="#02B3E4" fill-rule="evenodd" id="mark"></path></svg></div></a>

873: <a _ngcontent-iridium-us-c5="" href="/" id="ga-71dc5f" title="Udacity"><div _ngcontent-iridium-us-c5

These link tags overlap with the 'target' attribute tags. There are a couple additional tags captured, which don't seem required. We don't need any of these links.

### Inspect link tags with 'routerlinkactive' attribute

In [62]:
get_tags_with_keyword_attribute(all_links, 'routerlinkactive')

130: <a _ngcontent-iridium-us-c23="" class="track-link selected" href="/courses/all" routerlinkactive="selected">All</a>

131: <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/school-of-ai" routerlinkactive="selected">Artificial Intelligence</a>

132: <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/ai-product-manager" routerlinkactive="selected">AI Product Manager</a>

133: <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/ai-python" routerlinkactive="selected">AI Programming with Python</a>

134: <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/artificial-intelligence" routerlinkactive="selected">Artificial Intelligence</a>

135: <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/artificial-intelligence-for-trading" routerlinkactive="selected">Artificial Intelligence for Trading</a>

136: <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/computer-vision" routerlinkactive="selected">Compu

These link tags contain the various categories available. Thse would be good to keep as a separate dataset.

### Inspect link tags with 'title' attribute

In [64]:
get_tags_with_keyword_attribute(all_links, 'title')

0: <a _ngcontent-iridium-us-c1="" class="logo-link" href="/" irhybridlink="/" title="Udacity"><div _ngcontent-iridium-us-c1="" class="logo-wordmark"><svg _ngcontent-iridium-us-c1="" alt="Udacity" height="30" viewbox="0 0 180 30" width="180"><g _ngcontent-iridium-us-c1="" fill="none" fill-rule="evenodd"><path _ngcontent-iridium-us-c1="" d="M57.6 17.239187c0 2.546698-2.085517 4.407747-4.965517 4.407747-2.88 0-4.965517-1.86105-4.965517-4.407747V7.835994H45.68276v9.501143c0 3.330298 2.88 6.170846 6.951723 6.170846 4.071724 0 6.951724-2.938498 6.951724-6.170846V7.835994H57.6v9.403193zm16.286897-9.403193h-5.36276v15.67199h5.36276c4.468965 0 7.547586-3.03645 7.547586-7.835996 0-4.701596-3.07862-7.835994-7.547586-7.835994zm-.19862 13.71299h-3.177932V9.794994h3.17793c3.376553 0 5.76 2.252847 5.76 5.876994 0 3.917998-2.482757 5.779046-5.76 5.876996zm43.49793.29385c-3.575172 0-5.95862-2.644648-5.95862-6.170846 0-3.526197 2.482758-6.072895 5.95862-6.072895 2.78069 0 4.468966 1.5672 4.468966 1.5672

We don't need these tag links.

### Inspect link tags with 'region' attribute

In [74]:
get_tags_with_keyword_attribute(all_links, 'region')

941: <a _ngcontent-iridium-us-c17="" class="option" href="https://cn.udacity.com" region="cn"><img _ngcontent-iridium-us-c17="" alt="China" class="option-image" deferimgsrc="/assets/iridium/images/shared/flags/icon-flag-cn.svg" irdeferimage="" src="/assets/iridium/images/shared/flags/icon-flag-cn.svg" srcset="/assets/iridium/images/shared/flags/icon-flag-cn.svg 1x"/><label _ngcontent-iridium-us-c17="" class="option-text">China</label></a>

942: <a _ngcontent-iridium-us-c17="" class="option" href="https://eu.udacity.com" region="eu"><img _ngcontent-iridium-us-c17="" alt="Europe" class="option-image" deferimgsrc="/assets/iridium/images/shared/flags/icon-flag-eu.svg" irdeferimage="" src="/assets/iridium/images/shared/flags/icon-flag-eu.svg" srcset="/assets/iridium/images/shared/flags/icon-flag-eu.svg 1x"/><label _ngcontent-iridium-us-c17="" class="option-text">Europe</label></a>



We don't need these tag links. They refer to different Udacity region websites.

### Inspect link tags with 'class' attribute

In [67]:
# grab tags with class attribute
tags_with_class = []

for tag_link in all_links:
    if 'class' in tag_link.attrs.keys():
        tags_with_class.append(tag_link)

print(f"There are {len(tags_with_class)} links with a class attribute")

There are 531 links with a class attribute


In [77]:
for tag in tags_with_class[-10:]:
    print(tag.attrs.keys())
    print(tag)
    print('\n')

dict_keys(['_ngcontent-iridium-us-c27', 'class', 'href'])
<a _ngcontent-iridium-us-c27="" class="button--primary btn" href="/course/what-is-programming--ud994">Learn More</a>


dict_keys(['_ngcontent-iridium-us-c17', 'class'])
<a _ngcontent-iridium-us-c17="" class="selected"><img _ngcontent-iridium-us-c17="" alt="selected-flag" class="selected-image" deferimgsrc="/assets/iridium/images/shared/flags/icon-flag-us.svg" irdeferimage="" src="/assets/iridium/images/shared/flags/icon-flag-us.svg" srcset="/assets/iridium/images/shared/flags/icon-flag-us.svg 1x"/><label _ngcontent-iridium-us-c17="" class="selected-text">United States</label></a>


dict_keys(['_ngcontent-iridium-us-c17', 'class', 'region', 'href'])
<a _ngcontent-iridium-us-c17="" class="option" href="https://cn.udacity.com" region="cn"><img _ngcontent-iridium-us-c17="" alt="China" class="option-image" deferimgsrc="/assets/iridium/images/shared/flags/icon-flag-cn.svg" irdeferimage="" src="/assets/iridium/images/shared/flags/icon-

So the tags with classes do include courses. We probably don't want to filter them out yet.

### Inspect link tags with '_ngcontent-iridium-us-c27' attribute

In [75]:
get_tags_with_keyword_attribute(all_links, '_ngcontent-iridium-us-c27')

181: <a _ngcontent-iridium-us-c27="" href="/course/java-developer-nanodegree--nd035"><!-- --><div _ngcontent-iridium-us-c27="" class="image-container ng-star-inserted" style='background-image: url("https://d20vrrgs8k4bvw.cloudfront.net/images/degrees/nd035/nd-card.png");'><div _ngcontent-iridium-us-c27="" class="image-overlay"></div></div></a>

182: <a _ngcontent-iridium-us-c27="" class="capitalize" href="/course/java-developer-nanodegree--nd035">Java Developer</a>

183: <a _ngcontent-iridium-us-c27="" class="button--primary btn" href="/course/java-developer-nanodegree--nd035">Learn More</a>

184: <a _ngcontent-iridium-us-c27="" href="/course/ai-product-manager-nanodegree--nd088"><!-- --><div _ngcontent-iridium-us-c27="" class="image-container ng-star-inserted" style='background-image: url("https://d20vrrgs8k4bvw.cloudfront.net/images/degrees/nd088/nd-card.png");'><div _ngcontent-iridium-us-c27="" class="image-overlay"></div></div></a>

185: <a _ngcontent-iridium-us-c27="" class="capit

**Observations**:

We observe there is some duplication here that we can filter out by looking at the class type.

**Summary**

In summary, we will drop tag links which contain any of the following attributes:

- title
- region
- rel
- style
- id
- target

And we will keep

- class
- routerlinkactive

### Drop unneeded tags

In [69]:
tag_links_to_keep = []

print(f'Before - {len(all_links)} total links available\n')

for tag_link in all_links:
    if 'title' in tag_link.attrs.keys():
        continue
    if 'id' in tag_link.attrs.keys():
        continue
    if 'target' in tag_link.attrs.keys():
        continue
    if 'style' in tag_link.attrs.keys():
        continue
    tag_links_to_keep.append(tag_link)

print(f'After: - {len(tag_links_to_keep)} total links kept')

Before - 949 total links available

After: - 929 total links kept


### Get unique class tags

We observed earlier that some class tags may not be needed for our purposes. Let's get all the unique class tags:

In [83]:
unique_classes_count = dict()

print(f'Before - {len(tag_links_to_keep)} total links available\n')

for i, tag_link in enumerate(tag_links_to_keep):
    if 'class' in tag_link.attrs.keys():
        class_value = '^'.join(tag_link['class'])
        #print(tag_link['class'],'\t', class_value)
        #print(i, tag_link, '\n')
        unique_classes_count[class_value] = unique_classes_count.get(class_value, 0) + 1

print(f'\n{len(unique_classes_count)} Unique Classes Frequency')
for key, value in unique_classes_count.items():
    print(f"{value:<4}: {key}")

Before - 929 total links available


8 Unique Classes Frequency
2   : button^button--primary
1   : nav-toggle
1   : track-link^selected
50  : track-link
230 : capitalize
230 : button--primary^btn
1   : selected
2   : option


### Class 'track-link'

In [88]:
for i, tag_link in enumerate(tag_links_to_keep):
    if 'class' in tag_link.attrs.keys():
        #if 'capitalize' in tag_link['class']:
        #    print(i, tag_link, '\n')
        if 'track-link' in tag_link['class']:
            print(i, tag_link, '\n')

121 <a _ngcontent-iridium-us-c23="" class="track-link selected" href="/courses/all" routerlinkactive="selected">All</a> 

122 <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/school-of-ai" routerlinkactive="selected">Artificial Intelligence</a> 

123 <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/ai-product-manager" routerlinkactive="selected">AI Product Manager</a> 

124 <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/ai-python" routerlinkactive="selected">AI Programming with Python</a> 

125 <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/artificial-intelligence" routerlinkactive="selected">Artificial Intelligence</a> 

126 <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/artificial-intelligence-for-trading" routerlinkactive="selected">Artificial Intelligence for Trading</a> 

127 <a _ngcontent-iridium-us-c23="" class="track-link" href="/courses/computer-vision" routerlinkactive="selected">Comput

The 'track-link' class is used for the categories, which we want to keep. Since these have the `routerlinkactive` tag, we will use that to grab them later.

### Classes 'capitalize' and 'btn'

In [92]:
for i, tag_link in enumerate(tag_links_to_keep[:180]):
    if 'class' in tag_link.attrs.keys():
        if 'capitalize' in tag_link['class']:
            print(i, tag_link, '\n')
        if 'btn' in tag_link['class']:
            print(i, tag_link, '\n')

173 <a _ngcontent-iridium-us-c27="" class="capitalize" href="/course/java-developer-nanodegree--nd035">Java Developer</a> 

174 <a _ngcontent-iridium-us-c27="" class="button--primary btn" href="/course/java-developer-nanodegree--nd035">Learn More</a> 

176 <a _ngcontent-iridium-us-c27="" class="capitalize" href="/course/ai-product-manager-nanodegree--nd088">AI Product Manager</a> 

177 <a _ngcontent-iridium-us-c27="" class="button--primary btn" href="/course/ai-product-manager-nanodegree--nd088">Learn More</a> 

179 <a _ngcontent-iridium-us-c27="" class="capitalize" href="/course/sensor-fusion-engineer-nanodegree--nd313">Sensor Fusion Engineer</a> 



The 'capitalize' class is the class we are looking to use to grab our courses.

In [93]:
get_tags_with_keyword_attribute(tag_links_to_keep, '_ngcontent-iridium-us-c20')

0: <a _ngcontent-iridium-us-c20="" data-analytics-attrs='{
          "category": "Navbar Link",
          "label": "Navbar Link - Header Link Clicked - Catalog - school-of-programming - java-developer-nanodegree--nd035",
          "cta_message": "java-developer-nanodegree--nd035",
          "cta_type": "link",
          "cta_location": "/",
          "cta_destination": "course/java-developer-nanodegree--nd035"
        }' data-analytics-event="Navbar Link Clicked" href="/course/java-developer-nanodegree--nd035"><div _ngcontent-iridium-us-c20="" class="secondary-menu-item">Java Developer</div></a>

1: <a _ngcontent-iridium-us-c20="" data-analytics-attrs='{
          "category": "Navbar Link",
          "label": "Navbar Link - Header Link Clicked - Catalog - school-of-programming - cloud-developer-nanodegree--nd9990",
          "cta_message": "cloud-developer-nanodegree--nd9990",
          "cta_type": "link",
          "cta_location": "/",
          "cta_destination": "course/cloud-develo

---

### Format Data

In [72]:
for tag_link in tag_links_to_keep[:10]:
    print(tag_link, '\n')

<a _ngcontent-iridium-us-c20="" data-analytics-attrs='{
          "category": "Navbar Link",
          "label": "Navbar Link - Header Link Clicked - Catalog - school-of-programming - java-developer-nanodegree--nd035",
          "cta_message": "java-developer-nanodegree--nd035",
          "cta_type": "link",
          "cta_location": "/",
          "cta_destination": "course/java-developer-nanodegree--nd035"
        }' data-analytics-event="Navbar Link Clicked" href="/course/java-developer-nanodegree--nd035"><div _ngcontent-iridium-us-c20="" class="secondary-menu-item">Java Developer</div></a> 

<a _ngcontent-iridium-us-c20="" data-analytics-attrs='{
          "category": "Navbar Link",
          "label": "Navbar Link - Header Link Clicked - Catalog - school-of-programming - cloud-developer-nanodegree--nd9990",
          "cta_message": "cloud-developer-nanodegree--nd9990",
          "cta_type": "link",
          "cta_location": "/",
          "cta_destination": "course/cloud-developer-n

In [48]:
c.get_attribute_list('data-analytics-attrs')

['{\n          "category": "Navbar Link",\n          "label": "Navbar Link - Header Link Clicked - Catalog - school-of-programming - ios-developer-nanodegree--nd003",\n          "cta_message": "ios-developer-nanodegree--nd003",\n          "cta_type": "link",\n          "cta_location": "/",\n          "cta_destination": "course/ios-developer-nanodegree--nd003"\n        }']

In [49]:
c.get_attribute_list('data-analytics-event')

['Navbar Link Clicked']

In [50]:
c.attrs

{'_ngcontent-iridium-us-c20': '',
 'data-analytics-event': 'Navbar Link Clicked',
 'data-analytics-attrs': '{\n          "category": "Navbar Link",\n          "label": "Navbar Link - Header Link Clicked - Catalog - school-of-programming - ios-developer-nanodegree--nd003",\n          "cta_message": "ios-developer-nanodegree--nd003",\n          "cta_type": "link",\n          "cta_location": "/",\n          "cta_destination": "course/ios-developer-nanodegree--nd003"\n        }',
 'href': '/course/ios-developer-nanodegree--nd003'}

In [52]:
c.attrs.keys()

dict_keys(['_ngcontent-iridium-us-c20', 'data-analytics-event', 'data-analytics-attrs', 'href'])

In [53]:
type(c)

bs4.element.Tag

In [54]:
course_titles[:10]

[<a _ngcontent-iridium-us-c1="" class="logo-link" href="/" irhybridlink="/" title="Udacity"><div _ngcontent-iridium-us-c1="" class="logo-wordmark"><svg _ngcontent-iridium-us-c1="" alt="Udacity" height="30" viewbox="0 0 180 30" width="180"><g _ngcontent-iridium-us-c1="" fill="none" fill-rule="evenodd"><path _ngcontent-iridium-us-c1="" d="M57.6 17.239187c0 2.546698-2.085517 4.407747-4.965517 4.407747-2.88 0-4.965517-1.86105-4.965517-4.407747V7.835994H45.68276v9.501143c0 3.330298 2.88 6.170846 6.951723 6.170846 4.071724 0 6.951724-2.938498 6.951724-6.170846V7.835994H57.6v9.403193zm16.286897-9.403193h-5.36276v15.67199h5.36276c4.468965 0 7.547586-3.03645 7.547586-7.835996 0-4.701596-3.07862-7.835994-7.547586-7.835994zm-.19862 13.71299h-3.177932V9.794994h3.17793c3.376553 0 5.76 2.252847 5.76 5.876994 0 3.917998-2.482757 5.779046-5.76 5.876996zm43.49793.29385c-3.575172 0-5.95862-2.644648-5.95862-6.170846 0-3.526197 2.482758-6.072895 5.95862-6.072895 2.78069 0 4.468966 1.5672 4.468966 1.5672l.

In [61]:
for i, link in enumerate(course_titles[:-10]):
    
    if 'data-analytics-attrs' in link.attrs.keys():
        print(f"\n{i}: {link}\n")
        print(f'\t{link.attrs.keys()}')


1: <a _ngcontent-iridium-us-c20="" data-analytics-attrs='{
          "category": "Navbar Link",
          "label": "Navbar Link - Header Link Clicked - Catalog - school-of-programming - java-developer-nanodegree--nd035",
          "cta_message": "java-developer-nanodegree--nd035",
          "cta_type": "link",
          "cta_location": "/",
          "cta_destination": "course/java-developer-nanodegree--nd035"
        }' data-analytics-event="Navbar Link Clicked" href="/course/java-developer-nanodegree--nd035"><div _ngcontent-iridium-us-c20="" class="secondary-menu-item">Java Developer</div></a>

	dict_keys(['_ngcontent-iridium-us-c20', 'data-analytics-event', 'data-analytics-attrs', 'href'])

2: <a _ngcontent-iridium-us-c20="" data-analytics-attrs='{
          "category": "Navbar Link",
          "label": "Navbar Link - Header Link Clicked - Catalog - school-of-programming - cloud-developer-nanodegree--nd9990",
          "cta_message": "cloud-developer-nanodegree--nd9990",
          

In [24]:
for k, v in link.attrs.items():
    print(f"'{k}': '{v}'")

'_ngcontent-iridium-us-c20': ''
'data-analytics-event': 'Navbar Link Clicked'
'data-analytics-attrs': '{
          "category": "Navbar Link",
          "label": "Navbar Link - Header Link Clicked - Catalog - school-of-programming - ios-developer-nanodegree--nd003",
          "cta_message": "ios-developer-nanodegree--nd003",
          "cta_type": "link",
          "cta_location": "/",
          "cta_destination": "course/ios-developer-nanodegree--nd003"
        }'
'href': '/course/ios-developer-nanodegree--nd003'


In [3]:
browser = webdriver.Chrome(executable_path=chrome_driver)
browser.get(LOGIN_URL)

delay = 30  # seconds
try:
    username = WebDriverWait(browser, delay).until(
        EC.presence_of_element_located(
            (
                By.XPATH,
                "/html/body/app-root/div/ng-component/div/div/ng-component/div/form/div[1]/input",
            )
        )
    )
    print("Login Page is ready!")
except TimeoutException:
    print("Loading Login Page took too much time!")

password = browser.find_element_by_xpath(
    "/html/body/app-root/div/ng-component/div/div/ng-component/div/form/div[2]/input"
)

username.send_keys(EMAIL)
password.send_keys(PASSWORD)

login_button = browser.find_element_by_xpath(
    "/html/body/app-root/div/ng-component/div/div/ng-component/div/form/button"
)
login_button.click()

my_products = []
next_page_exists = True
page_number = 0
xml_path = "/html/body/app-root/div/ng-component/div/mat-sidenav-container/mat-sidenav-content/account-products/div/div/mat-paginator/div/div[2]/button[3]"


while next_page_exists:
    
    try:
        myElem = WebDriverWait(browser, delay).until(
            EC.presence_of_element_located((By.XPATH, xml_path))
        )
        print(f"Products Page {page_number} is ready!")
        
    except TimeoutException:
        print(f"Loading Products Page {page_number} took too much time!")
    
    # allow us to capture new html content
    sleep(3)

    soup_level1 = BeautifulSoup(browser.page_source, "html.parser")

    for product_title in soup_level1.find_all("h5"):
        print(f"\t{product_title.text}")
        my_products.append(product_title.text)
    
    sleep(1)
    xml_path = "/html/body/app-root/div/ng-component/div/mat-sidenav-container/mat-sidenav-content/account-products/div/div/mat-paginator/div/div[2]/button[3]"

    next_page_link = browser.find_element_by_xpath(xml_path)
    
    print(f"Next Page Displayed:'{next_page_link.is_displayed()}'\tNext Page Enabled:'{next_page_link.is_enabled()}'")
    
    next_page_exists = next_page_link.is_enabled()
    
    print("\nGetting next page...")
    page_number += 1
    browser.execute_script("arguments[0].click();", next_page_link)
    
print("\n\nCompleted going through all owned products!")
print(f"You have {len(my_products)} products!")

Login Page is ready!
Products Page 0 is ready!
	Pandas Cookbook
	Django by Example [Video]
	Learning Python [Video]
	Implementing Modern DevOps
	Software Architecture with Python [Video]
	Cloud Native programming with Golang
	DevOps with Kubernetes
	Ethical Hacking for Beginners [Video]
	Hands - On Reinforcement Learning with Python [Video]
	Network Programming with Go [Video]
Next Page Displayed:'True'	Next Page Enabled:'True'

Getting next page...
Products Page 1 is ready!
	Data Analysis with Python [Video]
	Python Microservices Development
	Python Web Scraping Cookbook
	Continuous Delivery with Docker and Jenkins
	AWS Certified Developer - Associate Guide
	Industrial Cybersecurity
	Python Web Scraping - Second Edition
	Machine Learning Algorithms
	Python GUI Programming Cookbook - Second Edition
	Practical Machine Learning Cookbook
Next Page Displayed:'True'	Next Page Enabled:'True'

Getting next page...
Products Page 2 is ready!
	Machine Learning with R - Second Edition
	Learning D

In [4]:
len(my_products)

96

In [5]:
packt_pub_products = my_products

In [6]:
list_of_lists = []
for product in packt_pub_products:
    list_of_lists.append([product])

In [7]:
list_of_lists[:10]

[['Pandas Cookbook'],
 ['Django by Example [Video]'],
 ['Learning Python [Video]'],
 ['Implementing Modern DevOps'],
 ['Software Architecture with Python [Video]'],
 ['Cloud Native programming with Golang'],
 ['DevOps with Kubernetes'],
 ['Ethical Hacking for Beginners [Video]'],
 ['Hands - On Reinforcement Learning with Python [Video]'],
 ['Network Programming with Go [Video]']]

In [8]:
df = pd.DataFrame(list_of_lists)
df.columns = ['product']
df.head()

Unnamed: 0,product
0,Pandas Cookbook
1,Django by Example [Video]
2,Learning Python [Video]
3,Implementing Modern DevOps
4,Software Architecture with Python [Video]


In [9]:
df = df['product'].str.split('[', expand=True)
df.columns = ['product', 'type']
df.loc[df['type'].isnull(), 'type'] = 'Book'
df['type'] = df['type'].str.replace(']', '')
df['product'] = df['product'].str.rstrip()
df.head()

Unnamed: 0,product,type
0,Pandas Cookbook,Book
1,Django by Example,Video
2,Learning Python,Video
3,Implementing Modern DevOps,Book
4,Software Architecture with Python,Video


In [10]:
now = datetime.datetime.now()
now.strftime('%F')

'2019-08-10'

In [11]:
csv_file_name = now.strftime('%F') + '-packt-pub-products-library.csv'
print(f"Saving to {csv_file_name}")
df.to_csv(csv_file_name)

Saving to 2019-08-10-packt-pub-products-library.csv


In [12]:
!ls

2019-08-10-packt-pub-products-library.csv  products_page_source.html
app.py					   secondary_interactive_notebook.ipynb
geckodriver.log				   venv
interactive_notebook.ipynb
