In [1]:
import traceback
import sys
from bs4 import BeautifulSoup as bs4
import pandas as pd 
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# URLs to scrape
nasa_news = "https://mars.nasa.gov/news/"
mars_images = "https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html"
mars_facts = "https://space-facts.com/mars/"
astro_url = "https://astrogeology.usgs.gov"
mars_hemispheres = astro_url + "/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

In [3]:
# browser class to enable scraping through chrome
# Each creation of ChromeBrowser object will open then close a new browser to perform an operation
class ChromeBrowser(object):

    def __init__(self, url):
        self.url = url

    def __enter__(self):
        executable_path = {'executable_path': ChromeDriverManager().install()}
        self.browser = Browser('chrome', **executable_path, headless=False)
        self.browser.visit(self.url)
        return self.browser

    def __exit__(self, ex_type, val, tb):
        if (ex_type is not None):
            traceback.print_exception(ex_type, val, tb)
        if (self.browser is not None):
            self.browser.quit()
        return True

In [4]:
# Scrape NASA News site
# Collect news_title and news_p from site and store for later use
news_title = ""
news_p = ""

with ChromeBrowser(nasa_news) as browser: 
    html = browser.html
    soup = bs4(html, "html.parser")
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text

print(news_title)
print('---------------------')
print(news_p)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389


[WDM] - Driver [C:\Users\Darbots\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache
Traceback (most recent call last):
  File "<ipython-input-4-a0efd0b28842>", line 10, in <module>
    news_p = soup.find('div', class_='article_teaser_body').text
AttributeError: 'NoneType' object has no attribute 'text'
Mars Now
---------------------



In [32]:
image_url = ""

with ChromeBrowser(mars_images) as browser:
    html = browser.html
    soup = bs4(html, "html.parser")
    browser.links.find_by_partial_text("FULL IMAGE").click()
    html = browser.html
    soup = bs4(html, "html.parser")
    image_url = mars_images.replace("index.html", soup.find(class_="fancybox-image")["src"])
print(image_url)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\Darbots\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache


https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars1.jpg


In [18]:
with ChromeBrowser(mars_facts) as browser:
    mars_facts_frame = pd.read_html(browser.html, match="Mars")[0]
    mars_facts_frame = mars_facts_frame.set_index("Mars - Earth Comparison", drop=True)
    mars_facts_frame.columns = [col.replace(":","") for col in mars_facts_frame.columns]
    #mars_facts_frame.drop('Earth', axis=1, inplace=True) #If you want to get rid of Earth Comparison
mars_facts_frame

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\Darbots\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache




Unnamed: 0_level_0,Mars,Earth
Mars - Earth Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [21]:
image_urls = []

with ChromeBrowser(mars_hemispheres) as browser:
    html = browser.html
    soup = bs4(html, "html.parser")
    res = soup.find(class_="result-list").find_all(class_="item")
    for r in res: 
        hemi = {}
        hemi["title"] = r.find("h3").text
        browser.links.find_by_partial_text(hemi["title"]).click()
        html = browser.html
        soup = bs4(html, "html.parser")
        hemi["img_url"] = astro_url + soup.find(class_="wide-image")["src"]
        image_urls.append(hemi)
        browser.back()

for url in image_urls:
    print(url)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\Darbots\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache


{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'}
{'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'}
{'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'}
{'title': 'Valles Marineris Hemisphere Enhanced', 'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}
