# Scraping
- Notebook allows for Cell > Run All option to be used.

In [103]:
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd

In [104]:
# Change executable path as needed
executable_path = {'executable_path': 'C:/bin/chromedriver'}
# browser = Browser('chrome', **executable_path)

## NASA Mars News
- Scrape https://mars.nasa.gov/news/ and collect the latest <b>News Title</b> and <b>Paragraph Text</b>.

In [105]:
# Visit url with splinter 
browser = Browser('chrome', **executable_path)
nasa_url = "https://mars.nasa.gov/news/"
browser.visit(nasa_url)

In [106]:
# Create the soup object and pass in the browser in html
html = browser.html
news_soup = BeautifulSoup(html, 'html.parser')

# Locate each list element within the page
list_element = news_soup.find('li', class_='slide')
list_element_title = list_element.find('div', class_='content_title').get_text()
list_element_title

"NASA's Treasure Map for Water Ice on Mars"

In [107]:
# Now, get the paragraph text
list_element_paragraph = list_element.find('div', class_='article_teaser_body').get_text()
list_element_paragraph

'A new study identifies frozen water just below the Martian surface, where astronauts could easily dig it up.'

## JPL Space Images Featured Image
- Go to https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars and look for a full image of the featured space image. Using splinter's browser, navigate the site to find the <b>image_url</b>.

In [108]:
#browser = Browser('chrome', **executable_path)
jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(jpl_url)

In [109]:
# Locate and click image to get full image
# interact with elements in page: https://splinter.readthedocs.io/en/latest/elements-in-the-page.html
full_image_button = browser.find_by_id('full_image')
full_image_button.click()

In [110]:
# Currently, there is a more info button, but maybe not always
# https://splinter.readthedocs.io/en/latest/matchers.html
browser.is_element_present_by_text('more info')
# Now, there is not distinguishable id we can look for like last time
# https://splinter.readthedocs.io/en/latest/finding.html
more_info_button = browser.find_link_by_partial_text('more info')
more_info_button.click()

In [111]:
html = browser.html
jpl_soup = BeautifulSoup(html, 'html.parser')
# Find image url
img_url_figure = jpl_soup.find('figure', class_='lede')
img_url_rel = img_url_figure.find('a').get('href')
img_url_rel

'/spaceimages/images/largesize/PIA18846_hires.jpg'

In [112]:
# Create the path with the main website
# img_url = f'https://www.jpl.nasa.gov{img_url_rel}'
img_url = 'https://www.jpl.nasa.gov' + img_url_rel
img_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA18846_hires.jpg'

## Mars Weather
- Visit the Mars Weather twitter account in https://twitter.com/marswxreport?lang=en and scrape the latest Mars weather tweeet.

In [113]:
# Use twitter to scrape for Mar's weather
#browser = Browser('chrome', **executable_path)
weather_url = "https://twitter.com/marswxreport?lang=en"
browser.visit(weather_url)
html = browser.html
weather_soup = BeautifulSoup(html, 'html.parser')

In [114]:
# After looking around and finding some potato cutting images
# it is best to locate the tweet made by the twitter handle @MarsWxReport

# Looking through all the div containers, we can see that they have a few attributes to aid our search
tweet = weather_soup.find('div', attrs={'class': 'tweet', 'data-screen-name': 'MarsWxReport'})
# get the text within the inner container
tweet_text = tweet.find('p', class_='tweet-text').get_text()
tweet_text

'https://www.nasa.gov/press-release/goddard/2019/mars-proton-aurora-common/\xa0…'

# Mars Facts
- Easily scrape the table from Mars Facts webpage http://space-facts.com/mars/ using Pandas, then convert into HTML table string.

In [115]:
# https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.read_html.html
# import pandas as pd

# Currently(12/12/2019), we need the first table, change if necessary
mars_df = pd.read_html("http://space-facts.com/mars/")[0]
mars_df.columns=['Description', 'Value']
mars_df.set_index('Description', inplace=True)
mars_df

Unnamed: 0_level_0,Value
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [116]:
# Now, convert to html
mars_df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Value</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

# Mars Hemisphere
- Visit the USGS Astrogeology site https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars to obtain the high resolution image for each of Mar's hemispheres. Navigate through the pages and find each of the hemispheres <b>img_url</b> and <b>title</b>.

In [124]:
#browser = Browser('chrome', **executable_path)
#hem_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
#browser.visit(hem_url)

In [125]:
# TEST
# find by css uses css syntax
#img_url = browser.find_by_css("a.product-item h3")[0]
#img_url.click()

In [126]:
# Find href from sample image anchor
#sample = browser.find_link_by_text('Sample')['href']
#sample

In [123]:
#title = browser.find_by_css('h2.title').text
#title

In [121]:
# Now, we want to create a dictionary to store all of the elements in the original hem_url
# The dictionary will contain two values 'img_url' and 'title'
# To store these, dictionaries, we will create an empty list
#browser = Browser('chrome', **executable_path)
hem_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(hem_url)

image_urls = []
# How many elements are in the page that contain what we want
links = browser.find_by_css("a.product-item h3")

for i in range(len(links)):
    data = {}
    # For each of the links, index through to click
    browser.find_by_css("a.product-item h3")[i].click()
    
    # Get the 'img_url' and 'title', first in case there are multiple samples
    sample = browser.find_link_by_text('Sample').first
    # Append to dictionary
    data['img_url'] = sample['href']
    data['title'] = browser.find_by_css("h2.title").text
         
    # Append dictionary to list
    image_urls.append(data)
    # end the loop by going back to the initial page rather than going through the url again
    browser.back()
image_urls

[{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [122]:
# Quit the browser
browser.quit()

In [7]:
try:
    df = pd.read_html("http://space-facts.com/mars/")[0]
except BaseException:
    None

df.columns = ["description", "value"]
df.set_index("description", inplace=True)

# Add some bootstrap styling to <table>
print(df.to_html(classes="table table-striped"))

<table border="1" class="dataframe table table-striped">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>value</th>
    </tr>
    <tr>
      <th>description</th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Equatorial Diameter:</th>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>Polar Diameter:</th>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>Mass:</th>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>Moons:</th>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>Orbit Distance:</th>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>Orbit Period:</th>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>Surface Temperature:</th>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>First Record:</th>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <th>Recorded By:</th>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>
