# Interactive Notebook - 2nd Attempt

## Setup

In [2]:
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import pprint as pp
import datetime
import os

In [2]:
# Catalog page that lists every Nanodegree and free course.
UDACITY_URL = "https://www.udacity.com/courses/all"

# Location of the chromedriver binary. The default matches the original
# hard-coded path; set the CHROMEDRIVER_PATH environment variable to run the
# notebook on a machine where the driver lives somewhere else.
chrome_driver = os.environ.get("CHROMEDRIVER_PATH", "/usr/bin/chromedriver")

## Get Data from Website

In [3]:
# NOTE(review): `executable_path=` is the Selenium 3 calling convention; newer
# Selenium releases expect a Service object instead -- confirm against the
# version pinned in requirements.txt before changing it.
browser = webdriver.Chrome(executable_path=chrome_driver)
browser.get(UDACITY_URL)

# XPath of the element that dismisses the pop-up modal shown on page load.
pop_up_xml_path = "/html/body/ir-root/ir-content/ir-autopopup-modal/ir-modal/div/div[2]/div/div[1]"

delay = 30  # seconds
try:
    popup_close_button = WebDriverWait(browser, delay).until(
        EC.presence_of_element_located((By.XPATH, pop_up_xml_path))
    )
except TimeoutException:
    # The close button never appeared, so there is nothing to click. Previously
    # the click below ran unconditionally and failed with a NameError because
    # `popup_close_button` was never assigned.
    print("Loading Course Catalog Page took too much time!")
else:
    print("Course Catalog Page is ready!")
    print("Closing pop up button")
    popup_close_button.click()

Course Catalog Page is ready!
Closing pop up button


In [8]:
# Snapshot the rendered DOM (after the JavaScript has run) and parse it once.
rendered_html = browser.page_source
soup = BeautifulSoup(rendered_html, "html.parser")

In [9]:
# Nanodegree cards are selected by the exact class string rendered on their <div>.
nanodegree_card_class = 'course-summary-card row row-gap-medium catalog-card nanodegree-card ng-star-inserted'
nanodegree_cards = soup.find_all('div', class_=nanodegree_card_class)

len(nanodegree_cards)

38

In [1]:
# Free-course cards use the same class string minus the `nanodegree-card` token.
course_card_class = 'course-summary-card row row-gap-medium catalog-card ng-star-inserted'
course_cards = soup.find_all('div', class_=course_card_class)

len(course_cards)

NameError: name 'soup' is not defined

## Explore Nanodegree Cards

In [104]:
# Dump the raw markup of the first two Nanodegree cards to study their structure.
nd_preview = nanodegree_cards[:2]
for position, card_markup in enumerate(nd_preview):
    print(f"{position}: {card_markup}\n")

0: <div _ngcontent-iridium-us-c24="" class="course-summary-card row row-gap-medium catalog-card nanodegree-card ng-star-inserted"><ir-catalog-card _ngcontent-iridium-us-c24="" _nghost-iridium-us-c27=""><div _ngcontent-iridium-us-c27="" class="card-wrapper is-collapsed"><div _ngcontent-iridium-us-c27="" class="card__inner card mb-0"><div _ngcontent-iridium-us-c27="" class="card__inner--upper"><div _ngcontent-iridium-us-c27="" class="image_wrapper hidden-md-down"><a _ngcontent-iridium-us-c27="" href="/course/java-developer-nanodegree--nd035"><!-- --><div _ngcontent-iridium-us-c27="" class="image-container ng-star-inserted" style='background-image: url("https://d20vrrgs8k4bvw.cloudfront.net/images/degrees/nd035/nd-card.png");'><div _ngcontent-iridium-us-c27="" class="image-overlay"></div></div></a><!-- --></div><div _ngcontent-iridium-us-c27="" class="card-content"><!-- --><span _ngcontent-iridium-us-c27="" class="tag tag--new card ng-star-inserted">New</span><!-- --><div _ngcontent-iridi

In [73]:
# Pretty-print a single card (index 6) so the nesting is readable.
sample_nd_card = nanodegree_cards[6]
print(sample_nd_card.prettify())

<div _ngcontent-iridium-us-c24="" class="course-summary-card row row-gap-medium catalog-card nanodegree-card ng-star-inserted">
 <ir-catalog-card _ngcontent-iridium-us-c24="" _nghost-iridium-us-c27="">
  <div _ngcontent-iridium-us-c27="" class="card-wrapper is-collapsed">
   <div _ngcontent-iridium-us-c27="" class="card__inner card mb-0">
    <div _ngcontent-iridium-us-c27="" class="card__inner--upper">
     <div _ngcontent-iridium-us-c27="" class="image_wrapper hidden-md-down">
      <a _ngcontent-iridium-us-c27="" href="/course/intro-to-machine-learning-nanodegree--nd229">
       <!-- -->
       <div _ngcontent-iridium-us-c27="" class="image-container ng-star-inserted" style='background-image: url("https://d20vrrgs8k4bvw.cloudfront.net/images/degrees/nd229/nd-card.jpg");'>
        <div _ngcontent-iridium-us-c27="" class="image-overlay">
        </div>
       </div>
      </a>
      <!-- -->
     </div>
     <div _ngcontent-iridium-us-c27="" class="card-content">
      <!-- -->
      

In [102]:
# Helper Function

def get_nanodegree_programs_details_list(nanodegree_cards):
    '''
    Extract the catalog details of every Nanodegree card.

    Parameters
    ----------
    nanodegree_cards : iterable
        Parsed card elements (e.g. the result of ``soup.find_all``). Each
        element only needs to support ``find(name, attrs)``.

    Returns
    -------
    list of lists containing info on nanodegree programs.
    Each nanodegree info list contains, in this order -
        - Name
        - Relative URL Link
        - Category
        - Skills (if present)
        - Collaborators (if present)
        - Level: Beginner, Intermediate, Advanced (if present)
        - Details (if present)

    Any element that is missing from a card yields '' for its field instead
    of raising AttributeError, so one incomplete card no longer aborts the
    whole extraction.
    '''
    span_attrs = {'class': 'ng-star-inserted'}

    def joined_span_text(section):
        # Concatenate the text of every span in an optional section.
        # The spans are joined with '' (not ', ') because the recorded output
        # shows their text already carries the separators.
        if section is None:
            return ''
        return ''.join(span.text for span in section.find_all('span', span_attrs))

    print("Getting Nanodegree program details list...")
    all_nd_info_list = []
    for nd_card in nanodegree_cards:
        # name and relative link both come from the title anchor
        nd_course_link = nd_card.find('a', {'class': 'capitalize'})
        if nd_course_link is not None:
            nd_name = nd_course_link.text
            nd_link = nd_course_link.get('href')
        else:
            nd_name, nd_link = '', ''

        category_tag = nd_card.find('h4', {'class': 'category ng-star-inserted'})
        nd_category = category_tag.text.strip() if category_tag is not None else ''

        nd_skills = joined_span_text(
            nd_card.find('div', {'class': 'skills ng-star-inserted'}))
        nd_collaborators = joined_span_text(
            nd_card.find('div', {'class': 'hidden-sm-down ng-star-inserted'}))

        level_tag = nd_card.find('div', {'class': 'right'})
        nd_level = level_tag.text.capitalize() if level_tag is not None else ''

        # Look up the expander first: chaining .find() directly on its result
        # fails when the card has no expander section.
        expander = nd_card.find('div', {'class': 'card__expander'})
        details_tag = expander.find('span', span_attrs) if expander is not None else None
        nd_details = details_tag.text if details_tag is not None else ''

        all_nd_info_list.append([
            nd_name,
            nd_link,
            nd_category,
            nd_skills,
            nd_collaborators,
            nd_level,
            nd_details,
        ])
    print("Completed getting Nanodegree program details list...")
    return all_nd_info_list

In [103]:
# Run the Nanodegree extractor over every card found on the catalog page.
nd_infos_list = get_nanodegree_programs_details_list(nanodegree_cards)

# Show the first two records as a sanity check.
nd_infos_list[:2]

Getting Nanodegree program details list...
Completed getting Nanodegree program details list...


[['Java Developer',
  '/course/java-developer-nanodegree--nd035',
  'School of Programming',
  'Java, Spring Boot, Rest API, MySQL, MongoDB',
  '',
  'Intermediate',
  'Learn back-end development with the Java programming language'],
 ['AI Product Manager',
  '/course/ai-product-manager-nanodegree--nd088',
  'School of Artificial Intelligence',
  'AI Products, Training ML Models, Annotating Datasets, Prototyping a Product',
  'Figure Eight',
  'Beginner',
  'Learn to develop AI products that deliver business value. Build skills that help you compete in the new AI-powered world.']]

---

## Explore Course Cards

In [127]:
# Dump the raw markup of the first two course cards to compare with the Nanodegree ones.
course_preview = course_cards[:2]
for position, card_markup in enumerate(course_preview):
    print(f"{position}: {card_markup}\n")

0: <div _ngcontent-iridium-us-c24="" class="course-summary-card row row-gap-medium catalog-card ng-star-inserted"><ir-catalog-card _ngcontent-iridium-us-c24="" _nghost-iridium-us-c27=""><div _ngcontent-iridium-us-c27="" class="card-wrapper is-collapsed"><div _ngcontent-iridium-us-c27="" class="card__inner card mb-0"><div _ngcontent-iridium-us-c27="" class="card__inner--upper"><div _ngcontent-iridium-us-c27="" class="image_wrapper hidden-md-down"><a _ngcontent-iridium-us-c27="" href="/course/self-driving-car-fundamentals-featuring-apollo--ud0419"><!-- --><div _ngcontent-iridium-us-c27="" class="image-container ng-star-inserted" style='background-image: url("https://d20vrrgs8k4bvw.cloudfront.net/images/courses/thumbnails/ud0419_thumbnail.jpg");'><div _ngcontent-iridium-us-c27="" class="image-overlay"></div></div></a><!-- --></div><div _ngcontent-iridium-us-c27="" class="card-content"><!-- --><span _ngcontent-iridium-us-c27="" class="tag tag--new card ng-star-inserted">New</span><!-- --><

In [128]:
# Pretty-print the first course card so the nesting is readable.
sample_course_card = course_cards[0]
print(sample_course_card.prettify())

<div _ngcontent-iridium-us-c24="" class="course-summary-card row row-gap-medium catalog-card ng-star-inserted">
 <ir-catalog-card _ngcontent-iridium-us-c24="" _nghost-iridium-us-c27="">
  <div _ngcontent-iridium-us-c27="" class="card-wrapper is-collapsed">
   <div _ngcontent-iridium-us-c27="" class="card__inner card mb-0">
    <div _ngcontent-iridium-us-c27="" class="card__inner--upper">
     <div _ngcontent-iridium-us-c27="" class="image_wrapper hidden-md-down">
      <a _ngcontent-iridium-us-c27="" href="/course/self-driving-car-fundamentals-featuring-apollo--ud0419">
       <!-- -->
       <div _ngcontent-iridium-us-c27="" class="image-container ng-star-inserted" style='background-image: url("https://d20vrrgs8k4bvw.cloudfront.net/images/courses/thumbnails/ud0419_thumbnail.jpg");'>
        <div _ngcontent-iridium-us-c27="" class="image-overlay">
        </div>
       </div>
      </a>
      <!-- -->
     </div>
     <div _ngcontent-iridium-us-c27="" class="card-content">
      <!-- -

In [134]:
# Helper Function

def get_course_programs_details_list(course_cards):
    '''
    Extract the catalog details of every free-course card.

    Parameters
    ----------
    course_cards : iterable
        Parsed card elements (e.g. the result of ``soup.find_all``). Each
        element only needs to support ``find(name, attrs)``.

    Returns
    -------
    list of lists containing info on course programs.
    Each course info list contains, in this order -
        - Name
        - Relative URL Link
        - Category (if present)
        - Skills (if present)
        - Collaborators (if present)
        - Level: Beginner, Intermediate, Advanced (if present)
        - Details (if present)

    Any element that is missing from a card yields '' for its field instead
    of raising AttributeError.
    '''
    span_attrs = {'class': 'ng-star-inserted'}

    def text_or_blank(tag):
        # Text of an optional element; '' when the element is absent.
        return '' if tag is None else tag.text

    def spans_text(section):
        # Concatenated text of every span inside an optional section.
        # The spans are joined with '' because the recorded output shows
        # their text already carries the ', ' separators.
        if section is None:
            return ''
        return ''.join(span.text for span in section.find_all('span', span_attrs))

    print("Getting Course program details list...")
    all_courses_info_list = []
    for course_card in course_cards:
        # course name and link (kept in separate variables: one is a tag,
        # the other a string)
        title_anchor = course_card.find('a', {'class': 'capitalize'})
        course_name = text_or_blank(title_anchor)
        course_link = title_anchor.get('href') if title_anchor is not None else ''

        category = text_or_blank(
            course_card.find('h4', {'class': 'category ng-star-inserted'})).strip()

        skills = spans_text(
            course_card.find('div', {'class': 'skills ng-star-inserted'}))
        collaborators = spans_text(
            course_card.find('div', {'class': 'hidden-sm-down ng-star-inserted'}))

        course_level = text_or_blank(
            course_card.find('div', {'class': 'right'})).capitalize()

        # Guard the expander before searching inside it; chaining .find()
        # on a missing expander raised AttributeError.
        expander = course_card.find('div', {'class': 'card__expander'})
        course_details = text_or_blank(
            expander.find('span', span_attrs) if expander is not None else None)

        all_courses_info_list.append([
            course_name,
            course_link,
            category,
            skills,
            collaborators,
            course_level,
            course_details,
        ])

    print("Completed getting Course program details list...")
    return all_courses_info_list

In [135]:
# Run the course extractor over every free-course card found on the catalog page.
courses_infos_list = get_course_programs_details_list(course_cards)

# Show the first two records as a sanity check.
courses_infos_list[:2]

Getting Course program details list...
Completed getting Course program details list...


[['Self-Driving Fundamentals: Featuring Apollo ',
  '/course/self-driving-car-fundamentals-featuring-apollo--ud0419',
  'School of Autonomous Systems',
  'Apollo HD Map, Localization, Perception, Prediction, Planning, Control',
  'Baidu',
  'Beginner',
  'Identify key parts of self-driving cars, utilize Apollo HD Map, localization, perception, prediction, planning and control, and start the learning path of building a self-driving car. '],
 ['Tales from the Genome',
  '/course/tales-from-the-genome--bio110',
  '',
  'Genetics, DNA, Gene regulation',
  '23andMe',
  'Beginner',
  'Learn the basics of genetics, with a personal twist. This class is all about DNA and how it shapes who we are.']]

---

**Observations**

The same code can be used to extract the details for both Nanodegrees and Courses, so let's build a generic function that can be utilized for both.

In [159]:
def get_programs_details_list(course_cards):
    '''
    Extract the catalog details of any mix of Nanodegree and course cards.

    Parameters
    ----------
    course_cards : iterable
        Parsed card elements (e.g. the result of ``soup.find_all``). Each
        element only needs to support ``find(name, attrs)``.

    Returns
    -------
    list of lists containing info on programs.
    Each program info list contains, in this order -
        - Name
        - Course Type ('nanodegree' or 'course', derived from the link)
        - Relative URL Link
        - Category (if present)
        - Skills (if present)
        - Collaborators (if present)
        - Level: Beginner, Intermediate, Advanced (if present)
        - Details (if present)

    Any element that is missing from a card yields '' for its field instead
    of raising AttributeError; a card without a link is typed as 'course'.
    '''
    span_attrs = {'class': 'ng-star-inserted'}

    def optional_text(tag):
        # Text of an optional element; '' when the element is absent.
        return tag.text if tag is not None else ''

    def joined_spans(section):
        # Concatenated text of every span inside an optional section.
        # The spans are joined with '' because the recorded output shows
        # their text already carries the ', ' separators.
        if section is None:
            return ''
        return ''.join(span.text for span in section.find_all('span', span_attrs))

    print("Getting program details list...")
    all_courses_info_list = []
    for course_card in course_cards:
        # course name, type and link
        title_anchor = course_card.find('a', {'class': 'capitalize'})
        course_name = optional_text(title_anchor)
        course_link = title_anchor.get('href') if title_anchor is not None else ''

        # The program type is inferred from the URL slug; tolerate a missing
        # href (None) rather than calling .lower() on it.
        link_lowered = (course_link or '').lower()
        if '--nd' in link_lowered or 'nanodegree' in link_lowered:
            course_type = 'nanodegree'
        else:
            course_type = 'course'

        category = optional_text(
            course_card.find('h4', {'class': 'category ng-star-inserted'})).strip()

        skills = joined_spans(
            course_card.find('div', {'class': 'skills ng-star-inserted'}))
        collaborators = joined_spans(
            course_card.find('div', {'class': 'hidden-sm-down ng-star-inserted'}))

        course_level = optional_text(
            course_card.find('div', {'class': 'right'})).capitalize()

        # Guard the expander before searching inside it; chaining .find()
        # on a missing expander raised AttributeError.
        expander = course_card.find('div', {'class': 'card__expander'})
        course_details = optional_text(
            expander.find('span', span_attrs) if expander is not None else None)

        all_courses_info_list.append([
            course_name,
            course_type,
            course_link,
            category,
            skills,
            collaborators,
            course_level,
            course_details,
        ])

    print("Completed getting program details list...")
    return all_courses_info_list

In [160]:
# Re-extract both card sets with the generic function; each record now also
# carries the program type as its second field.
nd_infos_list = get_programs_details_list(nanodegree_cards)

courses_infos_list = get_programs_details_list(course_cards)

Getting program details list...
Completed getting program details list...
Getting program details list...
Completed getting program details list...


In [161]:
# First two Nanodegree records produced by the generic extractor.
nd_infos_list[:2]

[['Java Developer',
  'nanodegree',
  '/course/java-developer-nanodegree--nd035',
  'School of Programming',
  'Java, Spring Boot, Rest API, MySQL, MongoDB',
  '',
  'Intermediate',
  'Learn back-end development with the Java programming language'],
 ['AI Product Manager',
  'nanodegree',
  '/course/ai-product-manager-nanodegree--nd088',
  'School of Artificial Intelligence',
  'AI Products, Training ML Models, Annotating Datasets, Prototyping a Product',
  'Figure Eight',
  'Beginner',
  'Learn to develop AI products that deliver business value. Build skills that help you compete in the new AI-powered world.']]

In [162]:
# First two course records produced by the generic extractor.
courses_infos_list[:2]

[['Self-Driving Fundamentals: Featuring Apollo ',
  'course',
  '/course/self-driving-car-fundamentals-featuring-apollo--ud0419',
  'School of Autonomous Systems',
  'Apollo HD Map, Localization, Perception, Prediction, Planning, Control',
  'Baidu',
  'Beginner',
  'Identify key parts of self-driving cars, utilize Apollo HD Map, localization, perception, prediction, planning and control, and start the learning path of building a self-driving car. '],
 ['Tales from the Genome',
  'course',
  '/course/tales-from-the-genome--bio110',
  '',
  'Genetics, DNA, Gene regulation',
  '23andMe',
  'Beginner',
  'Learn the basics of genetics, with a personal twist. This class is all about DNA and how it shapes who we are.']]

In [164]:
# Combine both record sets into a single new list, Nanodegrees first.
all_courses_list = [*nd_infos_list, *courses_infos_list]

program_count = len(all_courses_list)
print(f"There are {program_count} programs!")

There are 230 programs!


In [166]:
# Peek at the first five combined records.
all_courses_list[:5]

[['Java Developer',
  'nanodegree',
  '/course/java-developer-nanodegree--nd035',
  'School of Programming',
  'Java, Spring Boot, Rest API, MySQL, MongoDB',
  '',
  'Intermediate',
  'Learn back-end development with the Java programming language'],
 ['AI Product Manager',
  'nanodegree',
  '/course/ai-product-manager-nanodegree--nd088',
  'School of Artificial Intelligence',
  'AI Products, Training ML Models, Annotating Datasets, Prototyping a Product',
  'Figure Eight',
  'Beginner',
  'Learn to develop AI products that deliver business value. Build skills that help you compete in the new AI-powered world.'],
 ['Sensor Fusion Engineer',
  'nanodegree',
  '/course/sensor-fusion-engineer-nanodegree--nd313',
  'School of Autonomous Systems',
  'Perception, Lidar, Radar, Sensors, Computer Vision',
  'Mercedes',
  'Advanced',
  'Learn to fuse lidar point clouds, radar signatures, and camera images using Kalman Filters to perceive the environment and detect and track vehicles and pedestri

In [168]:
# Column order mirrors the field order produced by get_programs_details_list.
# Passing the names to the constructor (instead of assigning df.columns
# afterwards) also works when the scrape returned no records: an empty frame
# with the expected columns is created rather than a length-mismatch error.
df = pd.DataFrame(
    all_courses_list,
    columns=['name', 'type', 'link', 'category', 'skills', 'collaborators', 'level', 'detail'],
)
df.head()

Unnamed: 0,name,type,link,category,skills,collaborators,level,detail
0,Java Developer,nanodegree,/course/java-developer-nanodegree--nd035,School of Programming,"Java, Spring Boot, Rest API, MySQL, MongoDB",,Intermediate,Learn back-end development with the Java progr...
1,AI Product Manager,nanodegree,/course/ai-product-manager-nanodegree--nd088,School of Artificial Intelligence,"AI Products, Training ML Models, Annotating Da...",Figure Eight,Beginner,Learn to develop AI products that deliver busi...
2,Sensor Fusion Engineer,nanodegree,/course/sensor-fusion-engineer-nanodegree--nd313,School of Autonomous Systems,"Perception, Lidar, Radar, Sensors, Computer Vi...",Mercedes,Advanced,"Learn to fuse lidar point clouds, radar signat..."
3,Data Visualization,nanodegree,/course/data-visualization-nanodegree--nd197,School of Data Science,"Data Visualization, Tableau, Dashboards, Data ...",,Beginner,"Combine data, visuals, and narrative to tell i..."
4,Cloud Developer,nanodegree,/course/cloud-developer-nanodegree--nd9990,School of Cloud Computing,"AWS, Microservices, Serverless Architecture, K...",,Intermediate,Cloud development is the foundation for the ne...


In [169]:
# Timestamp of this scrape; reused below to name the output file.
now = datetime.datetime.now()
# '%Y-%m-%d' gives the same YYYY-MM-DD text as '%F', but '%F' is a C-library
# extension that is not guaranteed to exist on every platform.
now.strftime('%Y-%m-%d')

'2019-08-10'

In [170]:
# File name is prefixed with the scrape date (YYYY-MM-DD). '%Y-%m-%d' replaces
# '%F', which is a C-library extension and not portable across platforms.
csv_file_name = now.strftime('%Y-%m-%d') + '-udacity-course-catalog.csv'
print(f"Saving to {csv_file_name}")
# '|' is used as the delimiter, presumably because the skills and detail
# fields contain commas.
df.to_csv(csv_file_name, sep='|')

Saving to 2019-08-10-udacity-course-catalog.csv


In [171]:
# List the working directory to confirm the CSV file was written.
!ls

2019-08-10-udacity-course-catalog.csv
get_udacity_catalog.py
interactive_notebook.ipynb
interactive_notebook-second-attempt.ipynb
requirements.txt
venv
