# Web Scrapping App
This notebook will be converted to a Python app called scrape_mars.py

In [24]:
# Mission to Mars web scrapping demo will use the following dependencies:
# Browser (from splinter)
# BeautifulSoup (from bs4)
# Pandas

# Confirm they are loaded in your base Anaconda Navigator environment

from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
import re

### Setup the splinter Browser in order to scrape. The augument headless=False means you will see the Browser as it visits the url and scrapes

#### Splinter Browser documentation is availalbe at https://splinter.readthedocs.io/en/latest/

In [2]:
# # Setup splinter, Note: chromedriver is required for splinter
# my path to chromedrive from this notebook is up from the Jupyter-notebook folder to the appp folder then into the static folder
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False) # below we will refer to browser below and add Browser methods to identify CSS selectors

# NOTE you should see Chrome open with a comment: "Chrome is being controlled by automated test software.""

## #Ask 01: NASA Mars News
## https://mars.nasa.gov/news/
Scrape the NASA Mars News Site and collect the latest (first) News Title and Paragraph Text. Assign the text to variables that you can reference later.

#### Example:
news_title = "NASA's Perseverance Rover Is Midway to Mars "

news_para = "Sometimes half measures can be a good thing – especially on a journey this long. The agency's latest rover only has about 146 million miles left to reach its destination."

In [3]:
# chromedriver opens the url
# latest News Title and Paragraph Text from https://mars.nasa.gov/news/
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# consider technique of delay for loading the page - result should be True
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

##### Setup BeautifulSoup - documentation is availalbe at https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [4]:
# Convert the browser to a soup object. This allows us to then use that object with other BeautifulSoup methods like select_one and find 
html = browser.html
mars_news_soup = BeautifulSoup(html, 'html.parser')

### Open the url you are scraping and use "inspect" from the developer tools to view the webpage Document Object Model (DOM)
#### Find the News title and News paragraph.

In [5]:
# Using inspect reveals that the image, date, title and paragraph are all within a list item with the class=slide
# assign a variable (slide_elements) and use select_one(), which finds only the first tag that matches a selector: 
# in case the CSS selector is under the unordered list (ul) with class=item_list and the list item (li) with class='slide'

slide_element = mars_news_soup.select_one('ul.item_list li.slide')
slide_element

<li class="slide"><div class="image_and_description_container"><a href="/news/8785/nasas-perseverance-rover-is-midway-to-mars/" target="_self"><div class="rollover_description"><div class="rollover_description_inner">Sometimes half measures can be a good thing – especially on a journey this long. The agency's latest rover only has about 146 million miles left to reach its destination.</div><div class="overlay_arrow"><img alt="More" src="/assets/overlay-arrow.png"/></div></div><div class="list_image"><img alt="illustration of the Mars 2020 spacecraft on route to Mars" src="/system/news_items/list_view_images/8785_PIA24231-320.jpg"/></div><div class="bottom_gradient"><div><h3>NASA's Perseverance Rover Is Midway to Mars </h3></div></div></a><div class="list_text"><div class="list_date">October 27, 2020</div><div class="content_title"><a href="/news/8785/nasas-perseverance-rover-is-midway-to-mars/" target="_self">NASA's Perseverance Rover Is Midway to Mars </a></div><div class="article_tea

In [6]:
# narrow the selection to each div with class='content_title'
content_title = slide_element.find('div', class_='content_title')
content_title

<div class="content_title"><a href="/news/8785/nasas-perseverance-rover-is-midway-to-mars/" target="_self">NASA's Perseverance Rover Is Midway to Mars </a></div>

In [7]:
# use the parent elememnt to find the first <a> tag and save it as a variable 'news_title'
news_title = content_title.get_text()
news_title

"NASA's Perseverance Rover Is Midway to Mars "

In [8]:
# use the parent element to find the first news paragraph where the div tag has class='article_teaser_body' and save it as a variable 'news_para'
img_desc_container = slide_element.find('div', class_='image_and_description_container')
img_desc_container

<div class="image_and_description_container"><a href="/news/8785/nasas-perseverance-rover-is-midway-to-mars/" target="_self"><div class="rollover_description"><div class="rollover_description_inner">Sometimes half measures can be a good thing – especially on a journey this long. The agency's latest rover only has about 146 million miles left to reach its destination.</div><div class="overlay_arrow"><img alt="More" src="/assets/overlay-arrow.png"/></div></div><div class="list_image"><img alt="illustration of the Mars 2020 spacecraft on route to Mars" src="/system/news_items/list_view_images/8785_PIA24231-320.jpg"/></div><div class="bottom_gradient"><div><h3>NASA's Perseverance Rover Is Midway to Mars </h3></div></div></a><div class="list_text"><div class="list_date">October 27, 2020</div><div class="content_title"><a href="/news/8785/nasas-perseverance-rover-is-midway-to-mars/" target="_self">NASA's Perseverance Rover Is Midway to Mars </a></div><div class="article_teaser_body">Sometime

In [9]:
# use the parent element to find the first news paragraph where the div tag has class='article_teaser_body' and save it as a variable 'news_para'
news_para = img_desc_container.get_text()
news_para

"Sometimes half measures can be a good thing – especially on a journey this long. The agency's latest rover only has about 146 million miles left to reach its destination.NASA's Perseverance Rover Is Midway to Mars October 27, 2020NASA's Perseverance Rover Is Midway to Mars Sometimes half measures can be a good thing – especially on a journey this long. The agency's latest rover only has about 146 million miles left to reach its destination."

In [10]:
# use the parent element to find the image for the news story
news_img = img_desc_container.find('div', class_='list_image')
news_img

<div class="list_image"><img alt="illustration of the Mars 2020 spacecraft on route to Mars" src="/system/news_items/list_view_images/8785_PIA24231-320.jpg"/></div>

In [11]:
news_img_title = news_img.find('img', class_='').get('alt')
news_img_title

'illustration of the Mars 2020 spacecraft on route to Mars'

In [12]:
news_img_src = news_img.find('img', class_='').get('src')
news_img_src

'/system/news_items/list_view_images/8785_PIA24231-320.jpg'

In [13]:
# Create the new_image_url by concatenating the url with the news_img_url_relative
news_img_src = f"https://mars.nasa.gov{news_img_src}"
news_img_src

'https://mars.nasa.gov/system/news_items/list_view_images/8785_PIA24231-320.jpg'

In [14]:
# quit browser
browser.quit()

## Ask02: JPL Mars Space Images - Featured Image

## https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars

* Visit the url for JPL Featured Space Image.
* Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url.
* Make sure to find the image url to the full size .jpg image.
* Make sure to save a complete url string for this image.

#### # Example:
# Example:
featured_image_url = 'https://www.jpl.nasa.gov//spaceimages/images/largesize/PIA18273_hires.jpg'

In [15]:
# Since we closed the browser we need to open a new browser
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path)

#### Use splinter Browser (we assigned this to the variable browser above) to visit the sight and click through to find the items we want to scrape

In [16]:
# Use splinter to navigate to the site and find the image url for the current featured Mars Image and assign the url string to a variable called featured_image_url
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [17]:
# We are looking for the full image of the current "Featured Image" this will require a click through of the "FULL IMAGE" button.
full_image_element = browser.find_by_id('full_image')
full_image_element.click()

In [18]:
# On the next page we want to click the more information button
browser.is_element_present_by_text('more info', wait_time=1)
more_info_element = browser.links.find_by_partial_text('more info')
more_info_element.click()

In [19]:
# Now we are on a new page with the featured image details
# Convert the browser html to a soup object
html = browser.html
image_soup = BeautifulSoup(html, 'html.parser')

In [20]:
# Locate the relative url for the image url on the page
feature_image_url_relative = image_soup.select_one('figure.lede a img').get('src')
feature_image_url_relative

'/spaceimages/images/largesize/PIA14944_hires.jpg'

In [21]:
# Create the image_url by concatenating the url ('https://www.jpl.nasa.gov/) with the relative url for the complete url for the featured image 
feature_image_url = f'https://www.jpl.nasa.gov{feature_image_url_relative}'
feature_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA14944_hires.jpg'

In [31]:
# Locate the title of the featured image
feature_title = image_soup.select_one('h1.article_title').string
feature_title

'\n\t\t\t\tThe Rose\t\t\t  '

In [35]:
# remove all newlines (n), returnns (r), and tabs (t) form the title (string)
feature_img_title  = re.sub(r"[\n\t\r]*", "", feature_title)

# use the python .strip method to remove leading and trailing spaces from the feature_img_title
feature_img_title = feature_img_title.strip()

feature_img_title

'The Rose'

In [None]:
# quit browser
browser.quit()

## Ask:03 Mars Facts (Using Pandas to scrape a table)

## https://space-facts.com/mars/

* Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.

* Use Pandas to convert the data to a HTML table string.

#### Example
mars_facts = ```<table border="1" class="dataframe table table-hover table-dark">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>```

In [None]:
# create a Pandas DataFrame using the read_html method and grabing the first table from the https://space-facts.com/mars/ site
mars_df = pd.read_html('https://space-facts.com/mars/')[0]
mars_df

In [None]:
# name columns using the columns= method (project instructions show column names are Description and Mars)
mars_df.columns=('Description', 'Mars')

# use the set_index method to set index to 'Description' and set parameter inplace=True so the mars_df is changed
mars_df.set_index('Description', inplace=True)
mars_df

In [None]:
# convert the DataFrame to html using the to_html method and assign output to a file called table.html
mars_df.to_html('table.html')

# open table.html in VS Code to view html code to build the default table when no additional auguments are passed
# you can also view the default styled table by opening table.html in jupterLab

In [None]:
# additional auguments can be passed to the .to_html method to include Bootstrap styling 
# add the following Bootstrap classes as auguments (table table-hover table-dark)
# note when no output file name is passed as an augument, the output is the html code to build the table (with the classes included)

mars_df.to_html(classes="table table-hover table-dark")

## Ask 04: Mars Hemispheres (Using Splinter)

## https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars

* Visit the USGS Astrogeology site here to obtain high resolution images for each of Mar's hemispheres.

* You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.

* Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title.

* Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

# Example:
hemisphere_image_urls = [
    {"title": "Valles Marineris Hemisphere", "img_url": "..."},
    {"title": "Cerberus Hemisphere", "img_url": "..."},
    {"title": "Schiaparelli Hemisphere", "img_url": "..."},
    {"title": "Syrtis Major Hemisphere", "img_url": "..."},
]

In [None]:
# Since we closed the browser we need to open a new browser
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path)

In [None]:
# visit the USGS Astrogeology site - this should open the browser
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

In [None]:
# create an empty list
mars_hemisphere_image_urls = []

# Inspect the webpage to find the list of hemispheres: which is an <a> tag with class= itemlink and product-item and an h3 tag
links = browser.find_by_css('a.product-item h3')

# Iterate through all links, click the link and find the anchor tag and return the href
for i in range(len(links)):
    hemisphere = {}
    
    # (for each i) find the element and click through 
    browser.find_by_css('a.product-item h3')[i].click()
    
    # locate the link to the downloadable jpg file (seen on the page witht the text "Sample") and find the image anchor tag <a> and get the href
    sample_element =browser.links.find_by_text('Sample').first
    
    hemisphere['img_url'] = sample_element['href']
    
    # get title
    hemisphere['title'] = browser.find_by_css('h2.title').text
    
    # get paragraph
    hemisphere['para'] = browser.find_by_css('p').text
    
    # append to list
    mars_hemisphere_image_urls.append(hemisphere)
        
    # navigate backwards and start over for the next hemisphere
    browser.back()
    

In [None]:
mars_hemisphere_image_urls

In [None]:
# quit the browser
browser.quit()