In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
from splinter import Browser
import time
import urllib.request as req

In [2]:
# Boot the Chrome Driver & Assign HTML
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless = False)

# Scraping NASA pages

In [3]:
# URL to be scraped (https://mars.nasa.gov/ page)
url1 = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

# Retrieve the data
response = requests.get(url1)

# Create a Beautiful Soup object
soup1 = bs(response.text, "html5lib")
type(soup1)

bs4.BeautifulSoup

In [4]:
# Extract the text from the class="content_title" and clean up the text use strip
news_title = soup1.find_all('div', class_= 'content_title')[0].find('a').text.strip()
print(news_title)

NASA's InSight Places First Instrument on Mars


In [5]:
# Extract the paragraph from the class="rollover_description_inner" and clean up the text use strip
news_p = soup1.find_all('div', class_='rollover_description_inner')[0].text.strip()
print(news_p)

In deploying its first instrument onto the surface of Mars, the lander completes a major mission milestone.


In [6]:
# URL of page to be scraped
url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

# Visit the page using the browser
browser.visit(url2)

In [7]:
# Assign html content
html = browser.html

# Create a Beautiful Soup object
soup2 = bs(html, "html5lib")

In [8]:
# Scrape Path for the Feature Image. got the partial path of the url
partial_address = soup2.find_all('a', class_='fancybox')[0].get('data-fancybox-href').strip()
print(partial_address)

/spaceimages/images/mediumsize/PIA14924_ip.jpg


In [9]:
# Combine the root url to get the full address
featured_image_url = "https://www.jpl.nasa.gov" + partial_address

# Print to check the full URL
print(featured_image_url)

# Browse to check url
browser.visit(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA14924_ip.jpg


In [10]:
# Retrieve image url
imgurl = "https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA17838_ip.jpg"

req.urlretrieve(imgurl, "featured_image.jpg")

('featured_image.jpg', <http.client.HTTPMessage at 0x904f7f0>)

# Scraping Mars Weather twitter https://twitter.com/marswxreport?lang=en

In [11]:
# Assign URL
url3 = 'https://twitter.com/marswxreport?lang=en'

# Visit the page using the browser
browser.visit(url3)

In [12]:
# Reassign html content
html = browser.html

# Create a Beautiful Soup object
soup3 = bs(html, "html5lib")

In [13]:
# Scrape latest Mars weather tweet
mars_weather = soup3.find_all('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')[0].text
print(mars_weather)

Sol 2291 (2019-01-16), high -8C/17F, low -70C/-93F, pressure at 8.23 hPa, daylight 06:46-18:55pic.twitter.com/IRiqxlJqvT


# Scraping Space Facts https://space-facts.com/mars/

In [14]:
# URL of page to be scraped
url4 = 'https://space-facts.com/mars/'

# Visit the page using the browser
browser.visit(url4)

In [15]:
#  Use Pandas to get the url table
tables = pd.read_html(url4)
tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.42 x 10^23 kg (10.7% Earth)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.52 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                  -153 to 20 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers]

In [16]:
# Convert list of table into pandas dataframe
df = tables[0]

# Update column names
df.columns=['description','value']
df

Unnamed: 0,description,value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [17]:
# Turn the description header into the index
df.set_index('description', inplace=True)
df

Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [18]:
# Use pandas to  generate HTML tables from DataFrames and save as html file
df.to_html('table.html')

# Mars Hemispheres using the WayBack Machine at Archives.org (11/14/2018)

In [19]:
# URL of page to be scraped
url5 = 'https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

#Visit the page using the browser
browser.visit(url5)

# Create empty list
hemisphere_image_urls = []

# Create empty dict
dict = {}

In [20]:
# Reassign html content
html = browser.html

# Create a Beautiful Soup object
soup5 = bs(html,"html5lib")

In [21]:
# Setup to loop through <h3></h3>
scrapes = soup5.find_all('h3')

# Loop through each result
for scrape in scrapes:
    # Get text info from result
    itema = scrape.text
    print(f'Scraping: {itema}')
    time.sleep(1)    
    browser.click_link_by_partial_text(itema)
    time.sleep(1)
    # Reassign html content
    htmla = browser.html
    # Create a Beautiful Soup object
    soupa = bs(htmla,"html5lib")
    time.sleep(1)
    # Grab the image link
    linka = soupa.find_all('div', class_="downloads")[0].find_all('a')[0].get("href")
    # Pass title to dict
    time.sleep(1)
    dict["title"]=itema
    # Pass url to dict
    dict["img_url"]=linka
    print(f'Adding {itema} to list...')
    # Append dict to the list 
    hemisphere_image_urls.append(dict)
    # Clean Up dict
    dict = {}
    browser.visit(url5)
    time.sleep(1)

# Print the finishing statement
print("-----------------------------") 
print("Web Scraping Complete")
print("-----------------------------")

Scraping: Cerberus Hemisphere Enhanced
Adding Cerberus Hemisphere Enhanced to list...
Scraping: Schiaparelli Hemisphere Enhanced
Adding Schiaparelli Hemisphere Enhanced to list...
Scraping: Syrtis Major Hemisphere Enhanced
Adding Syrtis Major Hemisphere Enhanced to list...
Scraping: Valles Marineris Hemisphere Enhanced
Adding Valles Marineris Hemisphere Enhanced to list...
-----------------------------
Web Scraping Complete
-----------------------------


In [22]:
# Review List
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://web.archive.org/web/20181114182238/http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://web.archive.org/web/20181114182242/http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://web.archive.org/web/20181114182245/http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://web.archive.org/web/20181114182248/http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]