In [1]:
from bs4 import BeautifulSoup
from splinter import Browser
import requests
import time

# NASA Mars news page; it is fetched below first with plain requests and then again through the browser
news_url = "https://mars.nasa.gov/news/"


In [2]:
# Start the Splinter-controlled Chrome browser once; every browser-based cell below reuses this
# same instance. The driver location is platform- and host-dependent, which is why the setup is
# isolated in its own cell.
# NOTE(review): 'chromdriver' looks like a misspelling of 'chromedriver' — confirm the binary name on this host.
# NOTE(review): the options dict is passed positionally rather than as keyword arguments — confirm
# the installed Splinter version accepts this form.
browser = Browser('chrome', {'executable_path': '/usr/bin/chromdriver'}, headless=False)

## NASA Mars News
### Plain HTML attempt (failed)
See the Splinter version further below for a better approach to scraping dynamically loaded news. This attempt is kept for educational purposes, as an example of how not to do it.

In [3]:
# Fetch the raw (un-rendered) HTML of the news page over plain HTTP.
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(news_url, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page as if it were news
response.raise_for_status()
# Create BeautifulSoup object; parse with Python's built-in 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# Collect every <div> that carries the "slide" class; the matches come back as a list-like result set
results = soup.find_all("div", attrs={"class": "slide"})
# Show how many slides were found
len(results)

6

In [5]:
# Accumulates one dictionary per successfully parsed news slide
news = []

# Walk over the slides, pulling the teaser and the title out of each one
for result in results:
    try:
        # Teaser: text of the rollover description block
        teaser = result.find("div", class_="rollover_description_inner").text.strip()
        # Title: text of the link inside the content title block
        title_block = result.find("div", class_="content_title")
        title = title_block.find("a").text.strip()
    except AttributeError as err:
        # Typically a find() that matched nothing; report the problem and skip this slide
        print(err)
    else:
        # Both pieces were extracted, so record the news item
        news.append({'news_p': teaser, 'news_title': title})

# Display everything that was gathered
for item in news:
    print(f"Title:  {item['news_title']}")
    print(f"Teaser: {item['news_p']}")
    print()

Title:  Alabama High School Student Names NASA's Mars Helicopter
Teaser: Vaneeza Rupani's essay was chosen as the name for the small spacecraft, which will mark NASA's first attempt at powered flight on another planet.

Title:  Mars Helicopter Attached to NASA's Perseverance Rover
Teaser: The team also fueled the rover's sky crane to get ready for this summer's history-making launch.

Title:  NASA's Perseverance Mars Rover Gets Its Wheels and Air Brakes
Teaser: After the rover was shipped from JPL to Kennedy Space Center, the team is getting closer to finalizing the spacecraft for launch later this summer.

Title:  10.9 Million Names Now Aboard NASA's Perseverance Mars Rover
Teaser: As part of NASA's 'Send Your Name to Mars' campaign, they've been stenciled onto three microchips along with essays from NASA's 'Name the Rover' contest. Next stop: Mars.

Title:  Virginia Middle School Student Earns Honor of Naming NASA's Next Mars Rover
Teaser: NASA chose a seventh-grader from Virginia as

## Splinter version
It looks like the NASA news site loads its content dynamically, so a plain `requests` call is too static for the task. Let's load the page in a real web browser and redo the soup.

In [6]:
# Open the URL in the browser so the page's scripts can render the news list
browser.visit(news_url)
# Poll (for at most 10 seconds) until the first rendered news entry shows up instead of sleeping
# for a fixed time: a fixed pause is too short on a slow connection and wasted on a fast one.
if not browser.is_element_present_by_css('div.list_text', wait_time=10):
    print("Warning: no news entries appeared within 10 seconds; the page may be incomplete.")
# Grab the HTML as rendered by the browser
html = browser.html
# Test html for size to see if we got anything
len(html)

869218

In [7]:
 # Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(html, features='html.parser')
# Each rendered news entry is a <div> with the "list_text" class; matches come back as a list-like result set
results = soup.find_all("div", attrs={"class": "list_text"})
# Number of news entries found
len(results)

40

In [8]:
# Accumulates one dictionary per successfully parsed news entry
news = []

# Walk over the entries, pulling the teaser, title and date out of each one
for result in results:
    try:
        # Teaser: text of the article teaser block
        teaser = result.find("div", class_="article_teaser_body").text.strip()
        # Title: text of the link inside the content title block
        title_block = result.find("div", class_="content_title")
        title = title_block.find("a").text.strip()
        # Publication date: text of the list date block
        date = result.find("div", class_="list_date").text.strip()
    except AttributeError as err:
        # Typically a find() that matched nothing; report the problem and skip this entry
        print(err)
    else:
        # All three pieces were extracted, so record the news item
        news.append({'news_p': teaser, 'news_title': title, 'news_date': date})

# Display everything that was gathered
for item in news:
    print(f"Date:   {item['news_date']}")
    print(f"Title:  {item['news_title']}")
    print(f"Teaser: {item['news_p']}")
    print()

Date:   June 12, 2020
Title:  NASA's Mars Rover Drivers Need Your Help
Teaser: Using an online tool to label Martian terrain types, you can train an artificial intelligence algorithm that could improve the way engineers guide the Curiosity rover.

Date:   June  8, 2020
Title:  Three New Views of Mars' Moon Phobos
Teaser: Taken with the infrared camera aboard NASA's Odyssey orbiter, they reveal temperature variations on the small moon as it drifts into and out of Mars’ shadow.

Date:   June  2, 2020
Title:  The Extraordinary Sample-Gathering System of NASA's Perseverance Mars Rover
Teaser: Two astronauts collected Moon rocks on Apollo 11. It will take three robotic systems working together to gather up the first Mars rock samples for return to Earth.

Date:   May 26, 2020
Title:  The Detective Aboard NASA's Perseverance Rover
Teaser: An instrument called SHERLOC will, with the help of its partner WATSON, hunt for signs of ancient life by detecting organic molecules and minerals.

Date: 

## JPL Mars Space Images - Featured Image
The challenge is to click a button on the page to activate a pop-up window (a "fancy box") with the featured image and extract the source URL of that image.

In [9]:
# URL of page to be scraped
space_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
# Open the page in the Splinter-controlled browser (this does not use the requests module)
# NOTE(review): visit() is called for its side effect; its return value is probably None,
# so space_html likely holds nothing useful — confirm.
space_html = browser.visit(space_url)
# Fixed pause to give the page time to load
time.sleep(2.0)
# First element with id 'full_image'; the cell output shows its label is 'FULL IMAGE'
img_button = browser.find_by_id('full_image').first
# Display the button's label
img_button.text

'FULL IMAGE'

In [10]:
# Click the button located in the previous cell to open the pop-up with the featured image
img_button.click()
# Fixed pause so the pop-up has time to appear
time.sleep(2.0)
# First element carrying the 'fancybox-image' class
fancy_image = browser.find_by_css('.fancybox-image').first
# Its 'src' attribute is the URL of the featured image
featured_image_url = fancy_image['src']
# Close the pop-up again through its close link
browser.find_by_css('a.fancybox-close').click()
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA13911_ip.jpg


## Mars Weather
https://twitter.com/marswxreport?lang=en


In [11]:
# Open the Mars weather report Twitter feed in the browser
browser.visit("https://twitter.com/marswxreport?lang=en")
# Fixed pause, longer than for the other pages — presumably because the feed renders slowly; adjust if needed
time.sleep(5.0)
# Keep the HTML as rendered by the browser for parsing in the next cell
weather_html = browser.html
# Size of the rendered document as a quick check that content arrived
len(weather_html)

218916

In [12]:
# Parse the rendered Twitter page with the built-in 'html.parser'
InSight_soup = BeautifulSoup(weather_html, features='html.parser')
# Gather every <span> element on the page; matches come back as a list-like result set
InSight_spans = InSight_soup.find_all(name='span')
# How many spans there are to search through
len(InSight_spans)

194

In [13]:
# Tweet texts are buried deep in the div hierarchy, but each text paragraph sits inside a <span>.
# The weather tweets all start with "InSight sol " followed by the sol number, and the latest one
# is at the top of the page, so the first span matching that prefix is the one we want.

# Leading word to drop from the tweet, and the full prefix that identifies a weather tweet
insight_label = "InSight "
weather_prefix = insight_label + "sol "

# Take the first matching span and cut off the whole "InSight " label including its trailing space
# (slicing at index 7 kept that space, leaving a stray leading blank in the result).
# Falls back to "Not found." so a missing weather tweet is obvious in the output.
mars_weather = next(
    (
        span.text[len(insight_label):]
        for span in InSight_spans
        if span.text.startswith(weather_prefix)
    ),
    "Not found.",
)

# Print the result
print(mars_weather)

sol 549 (2020-06-12) low -91.8ºC (-133.3ºF) high -1.5ºC (29.3ºF)
winds from the SW at 4.5 m/s (10.1 mph) gusting to 18.2 m/s (40.6 mph)
pressure at 7.40 hPa
