In [1]:
# Dependencies
import time
from re import search
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

## NASA Latest News Scraping with Requests and BeautifulSoup

In [2]:
# Retrieve the NASA Mars landing page with the requests module.
# The '#news_and_events' fragment is a client-side anchor and is not part of
# the HTTP request, so this downloads the site's landing page.
url_nasa = 'https://mars.nasa.gov/#news_and_events'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here instead of parsing an error page.
response_nasa = requests.get(url_nasa, timeout=30)
response_nasa.raise_for_status()

In [3]:
# Parse the downloaded landing-page HTML with Python's built-in 'html.parser'
soup_nasa = bs(markup=response_nasa.text, features='html.parser')

In [4]:
# Collect every <h3> element carrying the 'title' CSS class (iterable of tags)
results_nasa = soup_nasa.find_all(name='h3', attrs={'class': 'title'})

In [5]:
# Container for the scraped results; this cell fills the 'nasa' key.
nasa_news = {}

# Three parallel lists: entry i of each list describes the same article.
article_title = []
article_link = []
article_text = []

# Collect the title and absolute URL of each headline found on the landing page.
for result in results_nasa:

    # A heading without an <a> child, or an anchor without an href, cannot be
    # followed, so report it and move on instead of raising.
    anchor = result.a
    if anchor is None or not anchor.get('href'):
        print(f'Skipping heading without a usable link: {result.get_text(strip=True)!r}')
        continue

    # Identify and return title of listing
    title = anchor.text

    # Identify and return link to listing
    link = anchor['href']

    # Links that already contain 'http' are skipped entirely; only
    # site-relative links are kept and followed below.
    if search('http', link):
        continue

    # Resolve the site-relative link against the landing-page URL.
    link = urljoin(url_nasa, link)

    # Append results only if a title is available (link is non-empty here)
    if title:
        article_title.append(title)
        article_link.append(link)

# Download each article and keep the text of its second <p> element.
# Distinct variable names are used so that response_nasa / soup_nasa /
# results_nasa from the earlier cells are not overwritten and this cell can
# be re-run safely.
for src in article_link:

    try:
        article_response = requests.get(src, timeout=30)
        article_response.raise_for_status()
    except requests.RequestException as err:
        # Keep the lists index-aligned even when an article cannot be fetched.
        print(f'Could not fetch {src}: {err}')
        article_text.append('')
        continue

    article_soup = bs(article_response.text, 'html.parser')

    # Article body container; may be absent if the page layout differs.
    content = article_soup.find('div', class_='wysiwyg_content')
    paragraphs = content.find_all('p') if content is not None else []

    # The second paragraph (index 1) is used as the teaser text
    # (presumably the first <p> is not body text - TODO confirm).
    # Always append exactly one entry per article.
    article_text.append(paragraphs[1].text if len(paragraphs) > 1 else '')

# import lists into nasa_news dictionary
nasa_news['nasa'] = {'title': article_title, 'src': article_link, 'parag': article_text}

nasa_news

{'nasa': {'title': ['NASA Ingenuity Mars Helicopter Prepares for First Flight',
   'NASA to Host Briefing to Preview First Mars Helicopter Flights',
   'Another First: Perseverance Captures the Sounds of Driving on Mars'],
  'src': ['https://mars.nasa.gov/news/8896/nasa-ingenuity-mars-helicopter-prepares-for-first-flight/',
   'https://mars.nasa.gov/news/8891/nasa-to-host-briefing-to-preview-first-mars-helicopter-flights/',
   'https://mars.nasa.gov/news/8892/another-first-perseverance-captures-the-sounds-of-driving-on-mars/'],
  'parag': ['NASA is targeting no earlier than April 8 for the Ingenuity Mars Helicopter to make the first attempt at powered, controlled flight of an aircraft on another planet. Before the 4-pound (1.8-kilogram) rotorcraft can attempt its first flight, however, both it and its team must meet a series of daunting milestones.',
   'NASA will hold a virtual media briefing at 1:30 p.m. EDT (10:30 a.m. PDT) Tuesday, March 23, to discuss upcoming activities for the a

## Find Latest Image from JPL Using Splinter

In [6]:
# Setup splinter: download/locate a matching chromedriver and open Chrome
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# Page that shows the JPL featured image
url_jpl = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'

# The try/finally guarantees the Chrome window is closed even if the page
# layout changed and one of the lookups below raises.
try:
    # Load the page in the automated browser
    browser.visit(url_jpl)

    # click on the image
    browser.links.find_by_partial_text('FULL IMAGE').click()

    # locate image source using splinter method
    image_url = browser.find_by_css('img.headerimage.fade-in')['src']
finally:
    # quit browser
    browser.quit()

print(image_url)

# Save to dictionary
nasa_news['mars'] = {'title': 'JPL Featured Image', 'src':image_url}

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389






[WDM] - Driver [C:\Users\jtstr\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache


https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars2.jpg


## Scrape data tables about Mars from Space-facts using pandas

In [7]:
# Use pandas to read the tables on the site (one DataFrame per HTML table)
url_mars = 'https://space-facts.com/mars/'
tables = pd.read_html(url_mars)

# First table: Mars facts. Its two columns are labelled 0 and 1, so name them
# and index by attribute. Method chaining builds a new frame instead of
# mutating tables[0] in place.
mars_data_tb = (
    tables[0]
    .rename(columns={0: 'Attribute', 1: 'Value'})
    .set_index('Attribute')
)

# Second table: Mars/Earth comparison, indexed by its attribute column.
mars_earth_tb = (
    tables[1]
    .rename(columns={'Mars - Earth Comparison': 'Attribute'})
    .set_index('Attribute')
)

# generate HTML code for each table
mars_earth_html = mars_earth_tb.to_html(index_names=False, border=0, justify='left')
mars_data_html = mars_data_tb.to_html(index_names=False, border=0, justify='left')

# add to dictionary
nasa_news['mars_facts'] = mars_data_html
nasa_news['mars_earth_facts'] = mars_earth_html

# print one table
print(mars_data_html)

<table border="0" class="dataframe">
  <thead>
    <tr style="text-align: left;">
      <th></th>
      <th>Value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Equatorial Diameter:</th>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>Polar Diameter:</th>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>Mass:</th>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>Moons:</th>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>Orbit Distance:</th>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>Orbit Period:</th>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>Surface Temperature:</th>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>First Record:</th>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <th>Recorded By:</th>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>


## Find image data for Mars hemispheres from USGS Astrogeology using Splinter and BeautifulSoup

In [8]:
# Setup splinter: download/locate a matching chromedriver and open Chrome
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# Search-results page that links to each hemisphere's detail page
url_mars_hemi = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

# Partial link texts used to find each hemisphere's link on the results page
hemis = ['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']

# Parallel lists: title[i] and img[i] describe the same hemisphere
title = []
img = []

# Use splinter to click through to each hemisphere page (to learn its URL),
# then requests + BeautifulSoup to read the title and thumbnail from that page.
# The try/finally guarantees the Chrome window is closed even if a step fails.
try:
    for hemi in hemis:

        # Start every iteration from the results page
        browser.visit(url_mars_hemi)

        # Fixed pause before looking for the link
        # (presumably to let the page finish rendering - TODO confirm)
        time.sleep(5)

        browser.links.find_by_partial_text(hemi).click()

        # URL of the hemisphere detail page the click navigated to
        new_url = browser.url

        # Download the detail page; fail loudly on a stalled or error response
        response_title = requests.get(new_url, timeout=30)
        response_title.raise_for_status()

        # Create BeautifulSoup object; parse with 'html.parser'
        soup_title = bs(response_title.text, 'html.parser')

        # save image title: keep the part of the page <title> before ' | '
        # and drop the ' Enhanced' suffix
        page_title = soup_title.find('head').title.text
        title.append(page_title.split(" | ")[0].replace(' Enhanced', ''))

        # save image source, resolved to an absolute URL against the page URL
        hemi_img = soup_title.find('img', class_='thumb')
        img.append(urljoin(new_url, hemi_img['src']))
finally:
    browser.quit()

# add data to dictionary
nasa_news['hemi'] = {'name': title, 'src': img}

# print results
print(title)
print(img)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389






[WDM] - Driver [C:\Users\jtstr\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache


['Cerberus Hemisphere', 'Schiaparelli Hemisphere', 'Syrtis Major Hemisphere', 'Valles Marineris Hemisphere']
['https://astrogeology.usgs.gov/cache/images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png', 'https://astrogeology.usgs.gov/cache/images/08eac6e22c07fb1fe72223a79252de20_schiaparelli_enhanced.tif_thumb.png', 'https://astrogeology.usgs.gov/cache/images/55a0a1e2796313fdeafb17c35925e8ac_syrtis_major_enhanced.tif_thumb.png', 'https://astrogeology.usgs.gov/cache/images/4e59980c1c57f89c680c0e1ccabbeff1_valles_marineris_enhanced.tif_thumb.png']
