In [5]:
import time
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver

In [6]:
def init_browser():
    """Create and return a splinter ``Browser`` driving Chrome.

    The driver binary is referenced by the bare name 'chromedriver'
    (presumably resolved from PATH -- confirm for your environment).
    Callers own the returned browser and must call ``quit()`` on it.
    """
    return Browser('chrome', executable_path='chromedriver')

In [7]:
def scrape_news():
    """Scrape the first news item shown on the NASA Mars news page.

    Opens a browser with ``init_browser``, loads
    https://mars.nasa.gov/news/ and parses the rendered HTML.

    Returns:
        dict: ``{"headline": str, "date": str, "teaser": str}`` taken from
        the first ``div.content_title``, ``div.list_date`` and
        ``div.article_teaser_body`` elements on the page.

    Raises:
        AttributeError: if one of those elements is missing from the page
            (``soup.find`` returned ``None``).
    """
    url = "https://mars.nasa.gov/news/"
    browser = init_browser()
    try:
        browser.visit(url)
        html = browser.html
    finally:
        # Always shut the browser down, even when the visit fails, so no
        # orphaned Chrome/chromedriver process is left behind.
        browser.quit()

    soup = BeautifulSoup(html, "html.parser")
    info = {}
    info["headline"] = soup.find("div", class_="content_title").get_text()
    info["date"] = soup.find("div", class_="list_date").get_text()
    info["teaser"] = soup.find("div", class_="article_teaser_body").get_text()

    return info

In [8]:
# Run the news scraper; the returned dict is displayed as the cell output.
scrape_news()

{'date': 'May 23, 2018',
 'headline': 'InSight Steers Toward Mars',
 'teaser': 'The spacecraft has completed its first trajectory correction maneuver.'}

In [9]:
def scrape_img():
    """Scrape the URL of the featured image on the JPL Mars space-images page.

    Opens a browser with ``init_browser``, loads the page and reads the
    ``data-fancybox-href`` attribute of the first ``a.button.fancybox``
    element, then prefixes it with the JPL base URL.

    Returns:
        str: the image URL.

    Raises:
        TypeError: if no matching anchor is found (``soup.find`` returned
            ``None``).
        KeyError: if the anchor has no ``data-fancybox-href`` attribute.
    """
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    base_url = "https://www.jpl.nasa.gov"
    browser = init_browser()
    try:
        browser.visit(url)
        html = browser.html
    finally:
        # Always shut the browser down, even when the visit fails, so no
        # orphaned Chrome/chromedriver process is left behind.
        browser.quit()

    soup = BeautifulSoup(html, "html.parser")
    img_path = soup.find("a", class_="button fancybox")['data-fancybox-href']
    full_img_url = base_url + img_path
    return full_img_url

In [10]:
# Run the featured-image scraper; the returned URL string is the cell output.
scrape_img()

'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA17462_ip.jpg'

In [11]:
# The tweet text is inside <div class="js-tweet-text-container">.
def scrape_weather():
    """Scrape the first tweet text from the @marswxreport Twitter page.

    Opens a browser with ``init_browser``, loads the page and takes the
    text of the first ``div.js-tweet-text-container`` element with all
    newline characters removed.

    Returns:
        str: the tweet text.

    Raises:
        AttributeError: if no matching element is found (``soup.find``
            returned ``None``).
    """
    url = "https://twitter.com/marswxreport?lang=en"
    browser = init_browser()
    try:
        browser.visit(url)
        html = browser.html
    finally:
        # Always shut the browser down, even when the visit fails, so no
        # orphaned Chrome/chromedriver process is left behind.
        browser.quit()

    soup = BeautifulSoup(html, "html.parser")
    latest_weather = soup.find("div", class_="js-tweet-text-container").get_text().replace('\n','')
    return latest_weather


In [12]:
# Run the weather scraper; the returned tweet text is the cell output.
scrape_weather()

'Sol 2060 (May 23, 2018), Sunny, high 4C/39F, low -73C/-99F, pressure at 7.43 hPa, daylight 05:20-17:20'

In [13]:
def scrape_facts():
    """Read the first HTML table on space-facts.com/mars into a dict.

    Column 0 of the table supplies the keys (with every ':' removed) and
    column 1 supplies the values.

    Returns:
        dict: mapping of fact label to fact value.
    """
    url = "https://space-facts.com/mars/"
    facts_table = pd.read_html(url)[0]
    # Strip the colons from the labels before pairing them with the values.
    labels = [label.replace(":", "") for label in facts_table[0]]
    values = list(facts_table[1])
    return dict(zip(labels, values))

In [14]:
# Run the facts scraper; the returned dict is displayed as the cell output.
scrape_facts()

{'Equatorial_Diameter': '6,792 km',
 'First_Record': '2nd millennium BC',
 'Mass': '6.42 x 10^23 kg (10.7% Earth)',
 'Moons': '2 (Phobos & Deimos)',
 'Orbit_Distance': '227,943,824 km (1.52 AU)',
 'Orbit_Period': '687 days (1.9 years)',
 'Polar_Diameter': '6,752 km',
 'Recorded_By': 'Egyptian astronomers',
 'Surface_Temperature': '-153 to 20 °C'}

In [15]:
# Scrape the hemisphere search-results page and collect the link (href) of each result.
# https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars

def scrape_url_list():
    """Collect the detail-page URLs from the USGS Mars hemisphere search.

    Opens a browser with ``init_browser``, loads the search-results page
    and gathers the ``href`` of every anchor inside each ``div.item``,
    prefixed with the astrogeology base URL.

    Returns:
        list[str]: the unique URLs, in the order they first appear on the
        page (so the result is stable from run to run).
    """
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    base_url = "https://astrogeology.usgs.gov"
    browser = init_browser()
    try:
        browser.visit(url)
        html = browser.html
    finally:
        # Always shut the browser down, even when the visit fails, so no
        # orphaned Chrome/chromedriver process is left behind.
        browser.quit()

    soup = BeautifulSoup(html, "html.parser")
    url_list = []
    seen = set()
    for item in soup.find_all("div", class_="item"):
        for a in item.find_all("a", href=True):
            full_url = base_url + a['href']
            # The same href can appear on more than one anchor; keep only
            # the first occurrence so the output order follows the page
            # instead of arbitrary set ordering.
            if full_url not in seen:
                seen.add(full_url)
                url_list.append(full_url)
    return url_list

In [16]:
# Run the URL collector; the returned list of URLs is the cell output.
scrape_url_list()

['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']

In [17]:
def scrape_href_list():
    """Visit each hemisphere page and collect its title and download link.

    The page URLs come from ``scrape_url_list``. For every page the text
    of ``h2.title`` is paired with the ``href`` of the first anchor inside
    the page's ``div.downloads`` (the last such div if there are several).

    Returns:
        list[dict]: one ``{"title": str, "href": str}`` entry per page that
        has a downloads section. Pages without one are skipped.

    Raises:
        AttributeError: if a page has no ``h2.title`` element.
        TypeError: if the downloads div contains no anchor.
    """
    href_list = []

    url_list = scrape_url_list()
    browser = init_browser()
    try:
        for url in url_list:
            browser.visit(url)
            soup = BeautifulSoup(browser.html, "html.parser")
            title = soup.find("h2", class_="title").get_text()
            download_divs = soup.find_all('div', class_="downloads")
            if not download_divs:
                # Nothing to record for this page. Skipping avoids appending
                # a stale entry left over from the previous page (or an
                # empty dict on the first page).
                continue
            href_list.append({
                "title": title,
                "href": download_divs[-1].find('a')['href'],
            })
    finally:
        # Always shut the browser down, even when a page fails to load or
        # parse, so no orphaned Chrome/chromedriver process is left behind.
        browser.quit()
    return href_list

In [18]:
# Run the hemisphere scraper; the returned list of dicts is the cell output.
scrape_href_list()

[{'href': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'href': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'href': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'href': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [20]:
def scrape():
    """Run every scraper in turn and bundle the results into one dict.

    Returns:
        dict: with keys "news", "img_url", "weather", "facts" and
        "img_list", holding the return values of ``scrape_news``,
        ``scrape_img``, ``scrape_weather``, ``scrape_facts`` and
        ``scrape_href_list`` respectively.
    """
    # Called in the same order as the keys appear in the result.
    news = scrape_news()
    img_url = scrape_img()
    weather = scrape_weather()
    facts = scrape_facts()
    img_list = scrape_href_list()
    return {
        "news": news,
        "img_url": img_url,
        "weather": weather,
        "facts": facts,
        "img_list": img_list,
    }

In [21]:
# Run the full scrape and pretty-print the combined result dict.
from pprint import pprint
pprint(scrape())

{'facts': {'Equatorial_Diameter': '6,792 km',
           'First_Record': '2nd millennium BC',
           'Mass': '6.42 x 10^23 kg (10.7% Earth)',
           'Moons': '2 (Phobos & Deimos)',
           'Orbit_Distance': '227,943,824 km (1.52 AU)',
           'Orbit_Period': '687 days (1.9 years)',
           'Polar_Diameter': '6,752 km',
           'Recorded_By': 'Egyptian astronomers',
           'Surface_Temperature': '-153 to 20 °C'},
 'img_list': [{'href': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
               'title': 'Cerberus Hemisphere Enhanced'},
              {'href': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
               'title': 'Schiaparelli Hemisphere Enhanced'},
              {'href': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
               'title': 'Syrtis Major Hemisphere Enhanced'},
              {'hre

In [25]:
# Scratch cell: print every entry of a small dict as "key,value".
dictionary = {"a":1, "b":2}
for key, value in dictionary.items():
    print("{},{}".format(key, value))

b,2
a,1
