In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
import time

In [2]:
def init_browser():
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    #Note: had to remove executable path due to issue with the way chromedriver is loaded.  
    #It creates 2 versions of the app, which causes conflict when running.  If running on local PC, uncomment executable_path
    # and add **executable_path between "chrome" and headless
    return Browser("chrome", headless=False)


In [3]:
#Mars News Data
#Scrape the first news article https://mars.nasa.gov/news and get the first result
def scrape():
    browser = init_browser()

    # Visit https://mars.nasa.gov/news
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    
    # Scrape page into Soup
    html = browser.html
    soup = bs(html, "html.parser")
    
    #give the page time to load
    time.sleep(1)

    # Get the first new titles
    list_title= soup.select_one("ul.item_list li.slide")
    title = list_title.find("div", class_="content_title").get_text()
 
    # Get the first articles teaser body content
    content = list_title.find("div", class_="article_teaser_body").get_text()

    # Store data in a dictionary
    mars_news = {
        "news_title": title,
        "mars_p": content
    }

    # Close the browser after scraping
    browser.quit()

    # Return results
    return mars_news

scrape()

{'news_title': "Robotic Toolkit Added to NASA's Mars 2020 Rover",
 'mars_p': "The bit carousel, which lies at the heart of the rover's Sample Caching System, is now aboard NASA's newest rover. "}

In [4]:
#JPL Mars Space Images
# Get the image path for the article
def scrape():
    browser = init_browser()

    # Visit https://mars.nasa.gov/news
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    img_url = "https://www.jpl.nasa.gov"
    browser.visit(url)

    #give the page time to load
    time.sleep(1)
    
    #step through the pages to get to the page with the image
    browser.click_link_by_partial_text('FULL IMAGE')
    
    #give the page time to load
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    
    # Scrape page into Soup
    html = browser.html
    soup = bs(html, "html.parser")

    # Get the first new titles
    image = soup.select_one("figure.lede a img").get("src")
    
    featured_img_url = img_url + image
    
    browser.quit()
    
    return featured_img_url
    
scrape()

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17936_hires.jpg'

In [22]:
#Mars Weather
#Scrape the latest tweet about Mars weather
def scrape():
    browser = init_browser()

    # Visit https://mars.nasa.gov/news
    url = "https://twitter.com/marswxreport?lang-en"
    browser.visit(url)

    #give the page time to load
    time.sleep(1)

    # Scrape page into Soup
    html = browser.html
    soup = bs(html, "html.parser")

    # Get the first new titles
    mars_weather = soup.select_one("div.js-tweet-text-container p").get_text()

    # Close the browser after scraping
    browser.quit()

    # Return results
    return mars_weather

scrape()

'InSight sol 258 (2019-08-18) low -100.0ºC (-148.1ºF) high -26.2ºC (-15.2ºF)\nwinds from the SSE at 5.3 m/s (11.9 mph) gusting to 16.8 m/s (37.6 mph)\npressure at 7.60 hPapic.twitter.com/5nCVjcsmlZ'

In [5]:
# Mars Facts
#give pandas the URL
url = 'https://space-facts.com/mars'
tables = pd.read_html(url)[1]
tables.head()

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"


In [8]:
tables.columns = ['Description', 'Value']
space_facts = tables.iloc[1:]
space_facts.set_index('Description', inplace=True)
space_facts.head()

Unnamed: 0_level_0,Value
Description,Unnamed: 1_level_1
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)


In [41]:
#HEMISPHERES
# Get the image path for the article
def scrape():
    browser = init_browser()

    # Visit https://mars.nasa.gov/news
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    
    #step through the pages to get to the page with the image
    time.sleep(1)
    browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    time.sleep(1)
    
    # Scrape page into Soup
    html = browser.html
    soup = bs(html, "html.parser")

    #scrap the page, gathering the image title and url
    title = soup.select_one("div.content h2.title").get_text()
    img_url = soup.select_one("div.downloads ul li a").get("href")
    
    # Store data in a dictionary
    hemisphere = {
        "title": title,
        "url": img_url
    }
    
    # Close the browser after scraping
    browser.quit()
    
    return hemisphere
    
scrape()

{'title': 'Cerberus Hemisphere Enhanced',
 'url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}

In [9]:
#HEMISPHERES
# Get the image path for the article
def scrape():
    browser = init_browser()

    #create list of links to click through with browser.click_link_by_partial_text
    sites = ['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced', 'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced']
    
    hemisphere_image_url = []
    
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    
    for site in sites:
        try:
            # Visit https://mars.nasa.gov/news    
            browser.click_link_by_partial_text(site)
            time.sleep(1)

            # Scrape page into Soup
            html = browser.html
            soup = bs(html, "html.parser")

            #scrap the page, gathering the image title and url
            title = soup.find("h2", class_="title").get_text()
            img_url = soup.find("a", text="Sample").get("href")

            # Store data in a dictionary
            hemisphere = {
                "title": title,
                "img_url": img_url
                }
            hemisphere_image_url.append(hemisphere)
            
        except Exception as e:
            hemisphere_image_url.append({
                "title": null,
                "img_url": null
                }) 
        
        browser.back()
    
    # Close the browser after scraping
    browser.quit()
    
    return hemisphere_image_url
    
scrape()

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [None]:
mars_data = {
    "news": mars_news, 
    "image": featured_img_url,
    "weather": mars_weather,
    "facts": space_facts,
    "hemisphere": hemisphere_image_url
    
}