In [2]:
# Dependencies
from bs4 import BeautifulSoup
from splinter import Browser
import requests
import pymongo
import pandas as pd

In [3]:
# URL of page to be scraped
news_url = 'https://mars.nasa.gov/news' 
img_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
twiter_url = "https://twitter.com/marswxreport?lang=en"
facts_url ="https://space-facts.com/mars/"
hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

In [4]:
def init_browser(headless):
    # @NOTE: Replace the path with your actual path to the chromedriver /usr/local/bin/
    executable_path = {"executable_path": "chromedriver.exe"}
    return Browser("chrome", **executable_path, headless=headless)

In [5]:
def visit_url(url, tag="body", class_="",headless=True):
    browser = init_browser(headless)
    browser.visit(url)
    html = browser.html
    
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html, 'lxml')
    
    #print(soup.prettify())
    if class_:
        results = soup.find_all(tag, class_=class_)
    else:
        results = soup.find_all(tag)
    
    return results

In [8]:
#retrieve title and paragraph from news_url
nasa_results = visit_url(news_url, 'div','list_text')
nasa_title = nasa_results[0].find('div', class_='content_title').a.text
nasa_p = nasa_results[0].find('div', class_='article_teaser_body').text
nasa_date = nasa_results[0].find('div', class_='list_date').text
print(f"article title: {nasa_title}\narticle date: {nasa_date}\nparagraph: {nasa_p}")

article title: What's Mars Solar Conjunction, and Why Does It Matter?
article date: August 23, 2019
paragraph: NASA spacecraft at Mars are going to be on their own for a few weeks when the Sun comes between Mars and Earth, interrupting communications.


In [6]:
# feature image
# image_results = visit_url(img_url, "a","button fancybox")
image_results = visit_url(img_url, "article","carousel_item")
featured_image_url = img_url + image_results[0].a["data-fancybox-href"]
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/mediumsize/PIA19637_ip.jpg


In [7]:
twiter_results = visit_url(twiter_url, "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
mars_weather = twiter_results[0].text.replace("hPapic.twitter.com/MhPPOHJg3m","")
print(mars_weather)

InSight sol 261 (2019-08-21) low -102.4ºC (-152.4ºF) high -26.6ºC (-15.8ºF)
winds from the SSE at 4.9 m/s (11.0 mph) gusting to 16.0 m/s (35.8 mph)
pressure at 7.70 


In [8]:
fact_results = pd.read_html(facts_url)[1].rename(columns={0:'',1:'value'})
fact_results.set_index('',inplace=True)
fact_dict = fact_results.to_dict()
fact_dict

{'value': {'Equatorial Diameter:': '6,792 km',
  'Polar Diameter:': '6,752 km',
  'Mass:': '6.39 × 10^23 kg (0.11 Earths)',
  'Moons:': '2 (Phobos & Deimos)',
  'Orbit Distance:': '227,943,824 km (1.38 AU)',
  'Orbit Period:': '687 days (1.9 years)',
  'Surface Temperature:': '-87 to -5 °C',
  'First Record:': '2nd millennium BC',
  'Recorded By:': 'Egyptian astronomers'}}

In [9]:
hemisphere_results = visit_url(hemisphere_url, "div", "description")
hemisphere_baseurl = "https://astrogeology.usgs.gov"
hemisphere_image_urls = [{"title":hemisphere.h3.text, \
                          "img_url":visit_url(hemisphere_baseurl+hemisphere.a['href'], 'li')[0].a['href']}\
                        for hemisphere in hemisphere_results]
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [10]:
def scrape_info():

    # URL of page to be scraped
    news_url = "https://mars.nasa.gov/news"
    img_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    twiter_url = "https://twitter.com/marswxreport?lang=en"
    facts_url ="https://space-facts.com/mars/"
    hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    # mars news headline and paragraph from news_url
    nasa_results = visit_url(news_url, 'div','list_text',False)
    nasa_title = nasa_results[0].find('div', class_='content_title').a.text
    nasa_p = nasa_results[0].find('div', class_='article_teaser_body').text

    # mars feature image
    image_results = visit_url(img_url, "article","carousel_item")
    featured_image_url = img_url + image_results[0].a["data-fancybox-href"]

    # mars weather
    twiter_results = visit_url(twiter_url, "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    mars_weather = twiter_results[0].text.replace("hPapic.twitter.com/MhPPOHJg3m","")

    # mars facts
    fact_results = pd.read_html(facts_url)[1].rename(columns={0:'',1:'value'})
    fact_results.set_index('',inplace=True)
    mars_facts = fact_results.to_dict()

    # mars hemispheres images
    hemisphere_results = visit_url(hemisphere_url, "div", "description")
    hemisphere_baseurl = "https://astrogeology.usgs.gov"
    hemisphere_image_urls = [{"title":hemisphere.h3.text, \
                            "img_url":visit_url(hemisphere_baseurl+hemisphere.a['href'], 'li')[0].a['href']}\
                            for hemisphere in hemisphere_results]
    
    mars_data = {
        "nasa_title" : nasa_title,
        "nasa_p" : nasa_p,
        "featured_image_url" : featured_image_url,
        "mars_weather" : mars_weather,
        "mars_facts" : mars_facts,
        "hemisphere_image_urls" : hemisphere_image_urls
    }

    return mars_data

In [None]:
# Examine the results, then determine element that contains sought info
def examine_mars_news():
    # Loop through returned results

    for result in nasa_results:
        # Error handling
        try:
            # Identify and return headline
            title = result.find('div', class_='content_title').a.text
            # Identify and return article's date
            date = result.find('div', class_='list_date').text
            # Identify and return paragraph
            paragraph = result.find('div', class_='article_teaser_body').text

            # Run only if title, price, and link are available
            if (title or date or paragraph):
                # Print results
                print('-------------')
                print(title)
                print(date)
                print(paragraph)

        except Exception as e:
            print(e)

In [None]:
examine_mars_news()

In [1]:
mars_data = scrape_info()
mars_data

NameError: name 'scrape_info' is not defined

In [22]:
mars_data['mars_facts']['value']

{'Equatorial Diameter:': '6,792 km',
 'Polar Diameter:': '6,752 km',
 'Mass:': '6.39 × 10^23 kg (0.11 Earths)',
 'Moons:': '2 (Phobos & Deimos)',
 'Orbit Distance:': '227,943,824 km (1.38 AU)',
 'Orbit Period:': '687 days (1.9 years)',
 'Surface Temperature:': '-87 to -5 °C',
 'First Record:': '2nd millennium BC',
 'Recorded By:': 'Egyptian astronomers'}

In [23]:
data = mars_data['mars_facts']['value']

html = '<table><tr><th></th></tr><th>Value</th></tr><tr><th>Description</th></tr><th></th></tr>'
# .join(data.keys())
for row in zip(*data):
    html += '<tr><td>' + '</td><td>'.join(row) + '</td></tr>'

html += '</table>'

print(html)

<table><tr><th>Equatorial Diameter:</th><th>Polar Diameter:</th><th>Mass:</th><th>Moons:</th><th>Orbit Distance:</th><th>Orbit Period:</th><th>Surface Temperature:</th><th>First Record:</th><th>Recorded By:</th></tr><tr><td>6</td><td>6</td><td>6</td><td>2</td><td>2</td><td>6</td><td>-</td><td>2</td><td>E</td></tr><tr><td>,</td><td>,</td><td>.</td><td> </td><td>2</td><td>8</td><td>8</td><td>n</td><td>g</td></tr><tr><td>7</td><td>7</td><td>3</td><td>(</td><td>7</td><td>7</td><td>7</td><td>d</td><td>y</td></tr><tr><td>9</td><td>5</td><td>9</td><td>P</td><td>,</td><td> </td><td> </td><td> </td><td>p</td></tr><tr><td>2</td><td>2</td><td> </td><td>h</td><td>9</td><td>d</td><td>t</td><td>m</td><td>t</td></tr><tr><td> </td><td> </td><td>×</td><td>o</td><td>4</td><td>a</td><td>o</td><td>i</td><td>i</td></tr><tr><td>k</td><td>k</td><td> </td><td>b</td><td>3</td><td>y</td><td> </td><td>l</td><td>a</td></tr><tr><td>m</td><td>m</td><td>1</td><td>o</td><td>,</td><td>s</td><td>-</td><td>l</td><td>n</

In [25]:
d = mars_data['mars_facts']['value']
keys = d.keys()
length = len(d[keys[0]])

items = ['<table style="width:300px">', '<tr>']
for k in keys:
    items.append('<td>%s</td>' % k)
items.append('</tr>')

for i in range(length):
    items.append('<tr>')
    for k in keys:
        items.append('<td>%s</td>' % d[k][i])
    items.append('</tr>')

items.append('</table>')

print('\n'.join(items))

TypeError: 'dict_keys' object does not support indexing