# Web Scraping App
This notebook will be converted to a Python app called scrape_mars.py

In [1]:
# Mission to Mars web scraping demo will use the following dependencies:
# Confirm they are loaded in the Anaconda Navigator environment

# Browser
# BeautifulSoup
# Pandas

from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# splinter drives Chrome through chromedriver (saved in the resources folder)
executable_path = {'executable_path': 'resources/chromedriver.exe'}
browser = Browser('chrome', headless=False, **executable_path)

# NOTE: a Chrome window should open with the message
# "Chrome is being controlled by automated test software."

## Ask 01: NASA Mars News
## https://mars.nasa.gov/news/
Scrape the NASA Mars News Site and collect the latest (first) News Title and Paragraph Text. Assign the text to variables that you can reference later.

#### Example:
news_title = "NASA's Perseverance Rover Is Midway to Mars "

news_para = "Sometimes half measures can be a good thing – especially on a journey this long. The agency's latest rover only has about 146 million miles left to reach its destination."

In [3]:
# Open the NASA Mars News page in the automated Chrome window
# (source of the latest News Title and Paragraph Text)
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# Give the page time to load: wait up to 1 second for the first news slide to be
# present before the HTML is read in the next cell - the result should be True
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

In [4]:
# Parse the HTML currently held by the browser into a BeautifulSoup object
html = browser.html
mars_news_soup = BeautifulSoup(markup=html, features='html.parser')

# select_one returns only the FIRST <li class="slide"> found inside <ul class="item_list">
# (the task asks for the latest, i.e. first, news item)
slide_element = mars_news_soup.select_one('ul.item_list li.slide')
slide_element

<li class="slide"><div class="image_and_description_container"><a href="/news/8785/nasas-perseverance-rover-is-midway-to-mars/" target="_self"><div class="rollover_description"><div class="rollover_description_inner">Sometimes half measures can be a good thing – especially on a journey this long. The agency's latest rover only has about 146 million miles left to reach its destination.</div><div class="overlay_arrow"><img alt="More" src="/assets/overlay-arrow.png"/></div></div><div class="list_image"><img alt="illustration of the Mars 2020 spacecraft on route to Mars" src="/system/news_items/list_view_images/8785_PIA24231-320.jpg"/></div><div class="bottom_gradient"><div><h3>NASA's Perseverance Rover Is Midway to Mars </h3></div></div></a><div class="list_text"><div class="list_date">October 27, 2020</div><div class="content_title"><a href="/news/8785/nasas-perseverance-rover-is-midway-to-mars/" target="_self">NASA's Perseverance Rover Is Midway to Mars </a></div><div class="article_tea

In [5]:
# Within the slide, look up the first <div> whose class is 'content_title'
slide_element.find('div', attrs={'class': 'content_title'})


<div class="content_title"><a href="/news/8785/nasas-perseverance-rover-is-midway-to-mars/" target="_self">NASA's Perseverance Rover Is Midway to Mars </a></div>

In [6]:
# Use the parent element to find the <div class="content_title"> that wraps the headline
# link and save its text as 'news_title'. The class is named explicitly: an empty class
# name does not identify the content_title div and only yields the title through
# whichever div it happens to match.
news_title = slide_element.find('div', class_='content_title').get_text()
news_title

"NASA's Perseverance Rover Is Midway to Mars "

In [7]:
# The teaser paragraph is the text of the slide's first <div class="article_teaser_body">;
# keep it in 'news_para'
teaser_div = slide_element.find('div', attrs={'class': 'article_teaser_body'})
news_para = teaser_div.get_text()
news_para

"Sometimes half measures can be a good thing – especially on a journey this long. The agency's latest rover only has about 146 million miles left to reach its destination."

In [8]:
# Close the automated browser now that the news title and paragraph are stored
browser.quit()

## Ask 02: JPL Mars Space Images - Featured Image

## https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars

* Visit the url for JPL Featured Space Image.
* Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url.
* Make sure to find the image url to the full size .jpg image.
* Make sure to save a complete url string for this image.

#### Example:
# Example:
featured_image_url = 'https://www.jpl.nasa.gov//spaceimages/images/largesize/PIA18273_hires.jpg'

In [10]:
# The previous browser was closed with quit(), so start a new Chrome session.
# NOTE(review): this driver path has no '.exe' suffix, unlike the first browser cell
# ('resources/chromedriver.exe'), and headless is not passed here - confirm which
# path matches the local file.
executable_path = {'executable_path': 'resources/chromedriver'}
browser = Browser('chrome', **executable_path)

In [11]:
# Use splinter to open the JPL space images page (filtered to the Mars category).
# The following cells click through from here to the current featured Mars image.
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [12]:
# We want the full-size version of the current "Featured Image"; that requires
# clicking through the "FULL IMAGE" button, located here by its element id 'full_image'.
full_image_element = browser.find_by_id('full_image')
full_image_element.click()

In [13]:
# On the next page we want to click the "more info" button.
# First wait up to 1 second for the text to be present (the True/False result is
# not checked), then find the link by partial text and click it.
browser.is_element_present_by_text('more info', wait_time=1)
more_info_element = browser.links.find_by_partial_text('more info')
more_info_element.click()

In [14]:
# The browser now shows the featured image's detail page;
# parse its HTML into a BeautifulSoup object
html = browser.html
image_soup = BeautifulSoup(markup=html, features='html.parser')

In [15]:
# The image is the first <img> inside a link within <figure class="lede">;
# its 'src' attribute holds the relative url
lede_image_tag = image_soup.select_one('figure.lede a img')
feature_image_url_relative = lede_image_tag.get('src')
feature_image_url_relative

'/spaceimages/images/largesize/PIA14925_hires.jpg'

In [33]:
# Build the complete image url by prefixing the site root ('https://www.jpl.nasa.gov')
# to the relative url. The task asks for the variable name 'featured_image_url';
# 'feature_image_url' is kept as an alias so existing references still work.
featured_image_url = f'https://www.jpl.nasa.gov{feature_image_url_relative}'
feature_image_url = featured_image_url
feature_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA14925_hires.jpg'

In [17]:
# Close the automated browser now that the featured image url is stored
browser.quit()

## Ask 03: Mars Facts (Using Pandas)

## https://space-facts.com/mars/

* Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.

* Use Pandas to convert the data to a HTML table string.

In [20]:
# read_html returns a list of DataFrames, one per table found on the page;
# the Mars facts table is the first item
mars_tables = pd.read_html('https://space-facts.com/mars/')
mars_df = mars_tables[0]
mars_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [21]:
# Label the two columns (see the table in the instructions for the column titles)
mars_df.columns = ['Description', 'Mars']

# Make 'Description' the index; inplace=True modifies mars_df itself
mars_df.set_index(keys='Description', inplace=True)
mars_df

Unnamed: 0_level_0,Mars
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [22]:
# Convert the DataFrame to an HTML table string (to_html with no target returns
# the markup) and keep it in a variable so it can be referenced later
mars_facts_html = mars_df.to_html()

# Also write the same table to the file 'table.html', as before
mars_df.to_html('table.html')

## Ask 04: Mars Hemispheres (Using Splinter)

## https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars

* Visit the USGS Astrogeology site here to obtain high resolution images for each of Mars's hemispheres.

* You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.

* Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title.

* Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

# Example:
hemisphere_image_urls = [
    {"title": "Valles Marineris Hemisphere", "img_url": "..."},
    {"title": "Cerberus Hemisphere", "img_url": "..."},
    {"title": "Schiaparelli Hemisphere", "img_url": "..."},
    {"title": "Syrtis Major Hemisphere", "img_url": "..."},
]

In [28]:
# The previous browser was closed with quit(), so start a new Chrome session.
# NOTE(review): this driver path has no '.exe' suffix, unlike the first browser cell
# ('resources/chromedriver.exe') - confirm which path matches the local file.
executable_path = {'executable_path': 'resources/chromedriver'}
browser = Browser('chrome', **executable_path)

In [29]:
# Visit the USGS Astrogeology search results for the enhanced Mars hemisphere images;
# the page loads in the browser window opened above
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

In [30]:
# Collect one {'img_url': ..., 'title': ...} dictionary per hemisphere in this list
mars_hemisphere_image_urls = []

# Each hemisphere on the results page is an <h3> inside an <a class="product-item"> link
links = browser.find_by_css('a.product-item h3')

# Visit every hemisphere page in turn; the link is looked up again on each pass
# because the browser leaves the results page and comes back to it
for link_position in range(len(links)):
    browser.find_by_css('a.product-item h3')[link_position].click()

    # The href of the first link whose text is 'Sample' is stored as the image url
    sample_link = browser.links.find_by_text('Sample').first

    # Record the image url and the page's <h2 class="title"> text, then add the entry to the list
    mars_hemisphere_image_urls.append({
        'img_url': sample_link['href'],
        'title': browser.find_by_css('h2.title').text,
    })

    # Return to the results page for the next hemisphere
    browser.back()
    

In [31]:
# Display the collected list: one dictionary (img_url, title) per hemisphere
mars_hemisphere_image_urls

[{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [32]:
# Close the automated browser now that the hemisphere image urls are stored
browser.quit()