In [17]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import requests 
import time
import re

In [None]:
# Set up Splinter

In [3]:
# Resolve the chromedriver binary via webdriver_manager (installs it or reuses the
# cached copy) and keep the path in a kwargs dict for Splinter.
executable_path = dict(executable_path=ChromeDriverManager().install())

# Launch a visible (non-headless) Chrome session; later cells reuse this `browser`.
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [C:\Users\John\.wdm\drivers\chromedriver\win32\92.0.4515.107\chromedriver.exe] found in cache


In [5]:
# Visit the Mars NASA news site
url = 'https://redplanetscience.com'
browser.visit(url)

# Optional delay for loading page: check for an element matching 'div.list_text',
# passing wait_time=1 so the check can wait briefly for the page to render.
# The cell displays the boolean result of this check.
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

In [6]:
# Convert the browser html to a soup object.
# The browser is NOT quit here; the same session is reused by later cells and
# is only closed by browser.quit() in the final cell.
html = browser.html
news_soup = soup(html, 'html.parser')
# Keep the element selected by 'div.list_text' as the parent element for the
# title and summary lookups in the following cells.
slide_elem = news_soup.select_one('div.list_text')

In [7]:
# We'll want to assign the title and summary text to variables we'll reference later.
# Begin the scraping by looking up the title element inside the parent element
# (`slide_elem`); the cell displays the matching <div class="content_title"> tag.

slide_elem.find('div', class_='content_title')

<div class="content_title">NASA to Reveal Name of Its Next Mars Rover</div>

In [8]:
# Use the parent element to find the first `div` with class `content_title`
# and save its text (via get_text()) as `news_title`
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title

'NASA to Reveal Name of Its Next Mars Rover'

In [None]:
# Although the code block above is similar to the last, there are some differences:
# We have created a new variable for the title
# Added the get_text() method
# And we're searching within the parent element for the title.

In [None]:
# Before we can update our code, we'll need to use our DevTools to make sure we're scraping the right tag and class.
# Use the DevTools selector tool and select the article summary (teaser), then check to see which tag is highlighted.

# We know that "article_teaser_body" is the right class name, but when we search for it, there is more than one result. 
#    What now?

# That's okay. There will be many matches because there are many articles, each with a tag of <div /> and 
#  a class of article_teaser_body. We want to pull the first one on the list, not a specific one, 
#   so more than 10 results is fine. In this case, if our scraping code is too specific, 
#    we'd pull only that article summary instead of the most recent.

# Because new articles are added to the top of the list, and we only need the most recent one, 
#   our search leads us to the first article. New news only, please!

# There are two methods used to find tags and attributes with BeautifulSoup:
#   .find() is used when we want only the first tag that matches the class and attribute we've specified.
#   .find_all() is used when we want to retrieve all of the tags that match the class and attribute.
# For example, if we were to use .find_all() instead of .find() when pulling the summary, 
#  we would retrieve all of the summaries on the page instead of just the first one.

In [9]:
# Use the parent element to find the first `div` with class `article_teaser_body`
# and save its text (the article summary) as `news_p`
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p

"After a months-long contest among students to name NASA's newest Mars rover, the agency will reveal the winning name — and the winning student — this Thursday. "

### Featured Images

In [None]:
# The first image that pops up on the webpage is the featured image. Robin wants the full-size version of this image, 
#  so we know we'll want Splinter to click the "Full Image" button. From there, the page directs us to a slideshow. 
# It's a little closer to getting the full-size feature image, but we aren't quite there yet.

# This is a lot of clicking to get to the image we want. Let's start getting our code ready to automate all of the clicks.

In [10]:
# Visit the space images site in the same browser session.
# Note: this rebinds `url`, which previously held the news site address.
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [11]:
# We want to click the full-size image button, so we can go ahead and use its HTML tag in our code.

# Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

# Notice the indexing chained at the end of the first line of code?
# With this, we've stipulated that we want our browser to click the second button
# (index 1, because indexing starts at 0).

In [None]:
# We need to click the More Info button to get to the next page. 
# Let's look at the DevTools again to see what elements we can use for our scraping.

# With the new page loaded onto our automated browser, it needs to be parsed so we can continue and scrape the 
#  full-size image URL. In the next empty cell, type the following:

In [12]:
# Parse the resulting html with soup.
# Note: this rebinds `html`, which previously held the news page markup.
html = browser.html
img_soup = soup(html, 'html.parser')

In [None]:
# Now we need to find the relative image URL. In our browser (make sure you're on the same page as the automated one), 
# activate your DevTools again. This time, let's find the image link for that image. This is a little more tricky.
# Remember, Robin wants to pull the most recently posted image for her web app. 

# It's important to note that the value of the src will be different every time the page is updated, 
#  so we can't simply record the current value—we would only pull that image each time the code is executed, 
#   instead of the most recent one.

# We'll use the image tag and class (<img /> and fancybox-image) to build the URL to the full-size image. 

In [15]:
# Find the relative image url
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

# We've done a lot with that single line.

# Let's break it down:
#   .find('img', class_='fancybox-image') looks for an <img /> tag with a class of fancybox-image.
#   .get('src') pulls the link to the image from that tag's src attribute.
# What we've done here is tell BeautifulSoup to look for an <img /> tag with a class of fancybox-image
# and read its src attribute.
# Basically we're saying, "This is where the image we want lives—use the link that's inside this tag."
# The saved output shows the value is a relative path, so it still needs the site's base URL.

'image/featured/mars2.jpg'

In [16]:
# Let's add the base URL to our code.

# Use the base URL to create an absolute URL: the relative path scraped into
# `img_url_rel` is appended to the site's address.
img_url = f'https://spaceimages-mars.com/{img_url_rel}'
img_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

In [None]:
# We're using an f-string to build this URL because it's a cleaner way to format strings; 
#  f-strings are also evaluated at run-time. This means the final URL is not fixed in the code: it is built from 
#   whatever value the variable holds when the code is executed. This works well for our scraping app because the data 
#    we're scraping is live and will be updated frequently.

In [None]:
# Robin has chosen to collect her data from Mars Facts (https://galaxyfacts-mars.com/), so let's visit the webpage to 
#  look at what we'll be working with. Robin already has a great photo and an article, so all she wants from this page is 
#   the table. Her plan is to display it as a table on her own web app, so keeping the current HTML table format is important.

# Let's look at the webpage again, this time using our DevTools. All of the data we want is in a <table /> tag. HTML code
#   used to create a table looks fairly complex, but it's really just breaking down and naming each component.

# Tables in HTML are basically made up of many smaller containers. The main container is the <table /> tag. 
# Inside the table is <tbody />, which is the body of the table—the headers, columns, and rows.

# <tr /> is the tag for each table row. Within that tag, the table data is stored in <td /> tags. 
# This is where the columns are established.

In [None]:
# Instead of scraping each row, or the data in each <td />, we're going to scrape the entire table with 
#   Pandas' .read_html() function.

# At the top of your Jupyter Notebook, add import pandas as pd to the dependencies and rerun the cell. 
# This way, we'll be able to use this new function without generating an error.

In [18]:
df = pd.read_html('https://galaxyfacts-mars.com')[0]
df.columns=['description', 'Mars', 'Earth']
df.set_index('description', inplace=True)
df


# Now let's break it down:

# df = pd.read_html('https://galaxyfacts-mars.com')[0] With this line, we're creating a new DataFrame 
#  from the HTML table. The Pandas function read_html() specifically searches for and returns a list of tables found
#   in the HTML. By specifying an index of 0, we're telling Pandas to pull only the first table it encounters,
#    or the first item in the list. Then, it turns the table into a DataFrame.

# df.columns=['description', 'Mars', 'Earth'] Here, we assign columns to the new DataFrame for additional clarity.

# df.set_index('description', inplace=True) By using the .set_index() function, we're turning the description column into 
#  the DataFrame's index. inplace=True means that the updated index will remain in place, without having to reassign the 
#   DataFrame to a new variable.

# Now, when we call the DataFrame, we're presented with a tidy, Pandas-friendly representation of the HTML table we were 
#  just viewing on the website.

# NOTE(review): the saved output shows the site's own heading row ('Mars - Earth Comparison', 'Mars', 'Earth') as the
#  first data row — confirm whether that row should be kept or dropped before the table is published.

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [19]:
# How do we add the DataFrame to a web application? Robin's web app is going to be an actual webpage. 
# Our data is live—if the table is updated, then we want that change to appear in Robin's app also.

# Thankfully, Pandas also has a way to easily convert our DataFrame back into HTML-ready code using 
#   the .to_html() function. Add this line to the next cell in your notebook and then run the code.
# The cell displays the result as a single string of <table> markup; it is not stored in a variable here.

df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [None]:
# The result is a slightly confusing-looking set of HTML code—it's a <table /> element with a lot of nested elements.
# This means success. After adding this exact block of code to Robin's web app, the data it's storing will be presented
#   in an easy-to-read tabular format.

# Now that we've gathered everything on Robin's list, we can end the automated browsing session.
# This is an important line to add to our web app also. 
# Without it, the automated browser won't know to shut down—it will continue to listen for instructions and 
#  use the computer's resources (it may put a strain on memory or a laptop's battery if left on). 
# We really only want the automated browser to remain active while we're scraping data. 
# It's like turning off a light switch when you're ready to leave the room or home.

# In the last empty cell of Jupyter Notebook, add browser.quit() and execute that cell to end the session.

In [20]:
# End the automated browsing session opened in the setup cell so the browser
# stops listening for instructions and releases the computer's resources.
browser.quit()