In [1]:
# Dependencies
from re import search
from urllib.parse import urljoin, urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

## NASA Latest News Scraping with BeautifulSoup and Requests

In [2]:
# Download the Mars news landing page as static HTML.
# The '#news_and_events' fragment is a client-side anchor and is not part of
# the HTTP request, so this fetches the site root.
url_nasa = 'https://mars.nasa.gov/#news_and_events'
# A timeout keeps the kernel from hanging forever on a stalled connection.
response_nasa = requests.get(url_nasa, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the news page.
response_nasa.raise_for_status()

In [3]:
# Parse the downloaded HTML using the 'html.parser' backend
soup_nasa = BeautifulSoup(markup=response_nasa.text, features='html.parser')

In [4]:
# Collect every <h3 class="title"> heading; the result can be iterated like a list
results_nasa = soup_nasa.find_all(name='h3', attrs={'class': 'title'})

[<h3 class="title">
<a href="/news/8896/nasa-ingenuity-mars-helicopter-prepares-for-first-flight/">NASA Ingenuity Mars Helicopter Prepares for First Flight</a>
</h3>, <h3 class="title">
<a href="/news/8891/nasa-to-host-briefing-to-preview-first-mars-helicopter-flights/">NASA to Host Briefing to Preview First Mars Helicopter Flights</a>
</h3>, <h3 class="title">
<a href="/news/8892/another-first-perseverance-captures-the-sounds-of-driving-on-mars/">Another First: Perseverance Captures the Sounds of Driving on Mars</a>
</h3>, <h3 class="title">
<a href="https://mars.nasa.gov/technology/helicopter/#Watch-Online" target="_self">Helicopter Flight Preview Webinar</a>
</h3>, <h3 class="title">
<a href="https://mars.nasa.gov/technology/helicopter/" target="_self">Earliest Flight Opportunity for Mars Helicopter</a>
</h3>, <h3 class="title">
<a href="https://mars.nasa.gov/mars-exploration/missions/phoenix/" target="_self">13th Anniversary: Mars Phoenix Landing</a>
</h3>]


In [8]:
nasa_news = {}

article_title = []
article_link = []
article_text = []  # left empty here; appended to by the cell that downloads each article

# Collect the title and absolute URL of every heading that links to a
# site-relative page.
for result in results_nasa:
    anchor = result.a
    if anchor is None:
        # Heading without a link: nothing to record
        continue

    title = anchor.text
    # .get() returns None instead of raising KeyError when the <a> has no href
    link = anchor.get('href')

    # Record an entry only if both title and link are available
    if not (title and link):
        continue

    # Headings that already carry an absolute URL are deliberately skipped;
    # only site-relative hrefs are kept. The scheme is checked at the start of
    # the href so a relative path that merely contains "http" is not dropped.
    if link.startswith(('http://', 'https://')):
        continue

    article_title.append(title)
    # Resolve the relative href against the landing-page URL
    article_link.append(urljoin(url_nasa, link))

# Import lists into nasa_news dictionary
nasa_news['nasa'] = {'title': article_title, 'src': article_link}
nasa_news

{'nasa': {'title': ['NASA Ingenuity Mars Helicopter Prepares for First Flight',
   'NASA to Host Briefing to Preview First Mars Helicopter Flights',
   'Another First: Perseverance Captures the Sounds of Driving on Mars'],
  'src': ['https://mars.nasa.gov/news/8896/nasa-ingenuity-mars-helicopter-prepares-for-first-flight/',
   'https://mars.nasa.gov/news/8891/nasa-to-host-briefing-to-preview-first-mars-helicopter-flights/',
   'https://mars.nasa.gov/news/8892/another-first-perseverance-captures-the-sounds-of-driving-on-mars/']}}

In [67]:
# Retrieve paragraph information from nasa.gov for each top news story.
# Start from an empty list so re-running this cell does not append duplicates.
article_text.clear()

for src in nasa_news['nasa']['src']:
    print(src)

    # Article-specific names are used so the landing-page objects from earlier
    # cells (response_nasa, soup_nasa, results_nasa) are not overwritten and
    # those cells stay re-runnable.
    article_response = requests.get(src, timeout=30)
    article_soup = BeautifulSoup(article_response.text, 'html.parser')

    # The article body is looked up in <div class="wysiwyg_content">
    content = article_soup.find('div', class_='wysiwyg_content')
    paragraphs = content.find_all('p') if content is not None else []

    # The second <p> of the body is used as the summary
    # (presumably the first one is not body text -- TODO confirm).
    if len(paragraphs) > 1:
        summary = paragraphs[1].text
    else:
        # Keep article_text aligned with the 'src' list even when the page
        # layout is not what this scraper expects.
        summary = ''
        print('No summary paragraph found')

    article_text.append(summary)
    print(summary)


https://mars.nasa.gov/news/8896/nasa-ingenuity-mars-helicopter-prepares-for-first-flight/
NASA is targeting no earlier than April 8 for the Ingenuity Mars Helicopter to make the first attempt at powered, controlled flight of an aircraft on another planet. Before the 4-pound (1.8-kilogram) rotorcraft can attempt its first flight, however, both it and its team must meet a series of daunting milestones.
https://mars.nasa.gov/news/8891/nasa-to-host-briefing-to-preview-first-mars-helicopter-flights/
NASA will hold a virtual media briefing at 1:30 p.m. EDT (10:30 a.m. PDT) Tuesday, March 23, to discuss upcoming activities for the agency’s Ingenuity Mars helicopter. The teams operating Ingenuity and NASA’s Mars 2020 Perseverance rover have chosen the flight zone where the helicopter will attempt the first powered, controlled flights on another planet.
https://mars.nasa.gov/news/8892/another-first-perseverance-captures-the-sounds-of-driving-on-mars/
As the Perseverance rover began to make trac

## Find Latest Image from JPL Using Splinter

In [None]:
# Start a visible (non-headless) Chrome session driven by Splinter, passing
# the path returned by ChromeDriverManager().install() as the driver executable
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Open the JPL Space page in the Splinter-controlled browser (not via requests)
url_jpl = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url_jpl)

In [None]:
# Locate the link whose text contains 'FULL IMAGE' and click it
full_image_link = browser.links.find_by_partial_text('FULL IMAGE')
full_image_link.click()

In [None]:

# Despite its name, `image_url` holds the result of the CSS lookup for the
# header <img>, not a string; the image address is read from its 'src' attribute.
image_url = browser.find_by_css('img.headerimage.fade-in')
print(image_url['src'])


In [None]:
# Close the browser session used for the JPL image lookup
browser.quit()

In [None]:
# Fetch the JPL Space page as static HTML with requests (no browser involved)
url_jpl = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'

# A timeout prevents the kernel from hanging on a stalled connection;
# raise_for_status() stops an HTTP error page from being parsed as the real page.
response_jpl = requests.get(url_jpl, timeout=30)
response_jpl.raise_for_status()

# Create BeautifulSoup object; parse with 'html.parser'
soup_jpl = BeautifulSoup(response_jpl.text, 'html.parser')

# Every <img> tag on the page, returned as an iterable list
results_jpl = soup_jpl.find_all('img')


In [None]:
# Display the <img> tags collected from the JPL page
results_jpl

In [None]:
def _table_rows(soup, table_id):
    """Return the non-empty <td> texts of each row of the table with the given id.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page to search.
    table_id : str
        Value of the ``id`` attribute of the wanted <table>.

    Returns
    -------
    list of list of str
        One inner list per <tr>, holding the stripped text of every <td>
        whose text is not empty.

    Raises
    ------
    ValueError
        If no table with ``table_id`` exists in ``soup``.
    """
    table = soup.find('table', attrs={'id': table_id})
    if table is None:
        raise ValueError('No <table id="{}"> found on the page'.format(table_id))

    # Fall back to the table itself when the markup has no explicit <tbody>
    table_body = table.find('tbody')
    if table_body is None:
        table_body = table

    rows = []
    for row in table_body.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        rows.append([text for text in cells if text])
    return rows


url_mars = 'https://space-facts.com/mars/'

# A timeout prevents the kernel from hanging on a stalled connection;
# raise_for_status() stops an HTTP error page from being parsed as the real page.
response_mars = requests.get(url_mars, timeout=30)
response_mars.raise_for_status()

# Create BeautifulSoup object; parse with 'html.parser'
soup_mars = BeautifulSoup(response_mars.text, 'html.parser')

# Rows of the table with id 'tablepress-p-mars'
data_mars = _table_rows(soup_mars, 'tablepress-p-mars')

# Rows of the table with id 'tablepress-comp-mars'
data_mars_earth = _table_rows(soup_mars, 'tablepress-comp-mars')

In [None]:
# Display the rows scraped from the 'tablepress-comp-mars' table
data_mars_earth

In [None]:
img_data = {'title': [], 'img': []}
title = []
img = []
src = []

# Link-text fragments used to find each result on the search page.
# (The list is called `moons`, but the search query is for Mars hemispheres.)
moons = ['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']

url_mars_hemi = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
# Scheme + host of the search URL, used to resolve image paths found on the detail pages
url_parts = urlsplit(url_mars_hemi)
url_recon = url_parts.scheme + '://' + url_parts.netloc

executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# try/finally guarantees the Chrome window is closed even if a link is
# missing or a request fails part-way through the loop.
try:
    for hemisphere in moons:
        # Go back to the search results before following the next link
        browser.visit(url_mars_hemi)
        browser.links.find_by_partial_text(hemisphere).click()
        new_url = browser.url
        src.append(new_url)

        # Re-fetch the detail page with requests and parse it with 'html.parser'
        response_title = requests.get(new_url, timeout=30)
        soup_title = BeautifulSoup(response_title.text, 'html.parser')

        # The page <title> is stored as-is
        hemi_title = soup_title.find('head')
        title.append(hemi_title.title.text)
        print(hemi_title.title.text)

        # Store the <img class="thumb"> address as an absolute URL
        hemi_img = soup_title.find('img', class_='thumb')
        img.append(urljoin(url_recon, hemi_img['src']))
        print(hemi_img['src'])
finally:
    browser.quit()

# Each list is appended as a single element, so the values end up nested one
# level deep: img_data['title'][0] is the list of titles.
img_data['title'].append(title)
img_data['img'].append(img)

In [None]:
# Tidy the scraped page titles in place: keep only the part before the first
# " | " separator and drop the " Enhanced" marker.
hemisphere_titles = img_data['title'][0]
hemisphere_titles[:] = [
    raw_title.split(" | ")[0].replace(' Enhanced', '')
    for raw_title in hemisphere_titles
]
img_data


In [None]:
# Spot-check one entry: the second title in the inner title list
img_data['title'][0][1]

In [None]:
# Let pandas parse the <table> elements of the Mars facts page into DataFrames
url_mars = 'https://space-facts.com/mars/'
tables = pd.read_html(url_mars)

# Keep the first two tables for the cells that follow
mars_data_tb, mars_earth_tb = tables[0], tables[1]

In [None]:
# Name the two integer-labelled columns and index the rows by 'Attribute'.
# The frame is rebuilt from tables[0] rather than modified in place, so this
# cell can be re-run: with inplace=True a second run fails because 'Attribute'
# is already the index and no longer a column.
mars_data_tb = (
    tables[0]
    .rename(columns={0: 'Attribute', 1: 'Value'})
    .set_index('Attribute')
)

In [None]:
# Display the Mars facts table
mars_data_tb

In [None]:
# Rename the 'Mars - Earth Comparison' column to 'Attribute' and use it as the index.
# The frame is rebuilt from tables[1] rather than modified in place, so this
# cell can be re-run: with inplace=True a second run fails because 'Attribute'
# is already the index and no longer a column.
mars_earth_tb = (
    tables[1]
    .rename(columns={'Mars - Earth Comparison': 'Attribute'})
    .set_index('Attribute')
)

In [None]:
# Display the Mars/Earth comparison table
mars_earth_tb

In [None]:
# Render both DataFrames as HTML table markup.
mars_data_html = mars_data_tb.to_html()
# For the comparison table, keep the column header row but leave out the index name
mars_earth_html = mars_earth_tb.to_html(index_names=False, header=True)

In [None]:
# Display the generated HTML string for the comparison table
mars_earth_html