In [40]:
# Dependencies
from io import StringIO
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

# NASA Mars News

In [14]:
# The news page fills in its content with JavaScript, so a plain
# BeautifulSoup scrape would miss it. Drive a real Chrome window with
# splinter instead; webdriver_manager downloads a matching chromedriver.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
There is no [win32] chromedriver for browser 94.0.4606 in cache
Get LATEST driver version for 94.0.4606
Trying to download new driver from https://chromedriver.storage.googleapis.com/94.0.4606.61/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\kelln\.wdm\drivers\chromedriver\win32\94.0.4606.61]


In [15]:
# Mars news site to scrape
url = 'https://redplanetscience.com/'

# Load the page in the browser, then grab the rendered markup
browser.visit(url)
html = browser.html

# Parse the rendered markup with the built-in 'html.parser' backend
soup = bs(markup=html, features='html.parser')

In [16]:
# Display the parsed document so its structure (tags and class names)
# can be inspected before writing the selectors used in the next cell.
soup

<html><head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link crossorigin="anonymous" href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta1/dist/css/bootstrap.min.css" integrity="sha384-giJF6kkoqNQ00vy+HMDP7azOuL0xtbfIcaT9wjKHr8RbDVddVHyTfAAsrekwKmP1" rel="stylesheet"/>
<link href="css/font.css" rel="stylesheet" type="text/css"/>
<link href="css/app.css" rel="stylesheet" type="text/css"/>
<link crossorigin="anonymous" href="https://pro.fontawesome.com/releases/v5.10.0/css/all.css" integrity="sha384-AYmEC3Yw5cVb3ZcuHtOA93w35dYTsvhLPVnYs9eStHfGJvOvKxVfELGroGkvsg+p" rel="stylesheet"/>
<title>News - Mars Exploration Program</title>
</head>
<body>
<div class="col-md-12">
<div class="row">
<nav class="navbar navbar-expand-lg navbar-light fixed-top">
<div class="container-fluid">
<a class="navbar-brand" href="#">
<img src="image/nasa.png" width="80"/><span class="logo">MARS Planet Science</span>
<span class="logo1">Exploration Program</spa

Close inspection of the page shows that the news items are listed with the most recent one first, so the first match for each selector below belongs to the latest article.

In [26]:
# Headline: text of the first <div> carrying the 'content_title' class
title_div = soup.find('div', attrs={'class': 'content_title'})
news_title = title_div.get_text()

# Teaser paragraph: text of the first <div> carrying the 'article_teaser_body' class
teaser_div = soup.find('div', attrs={'class': 'article_teaser_body'})
news_p = teaser_div.get_text()

# Done with the news site; shut the browser window down
browser.quit()

# JPL Mars Space Images - Featured Image

In [36]:
# Base url of the space-images site (trailing slash included)
image_url = 'https://spaceimages-mars.com/'

# Start a fresh Chrome window and load the page
browser = Browser('chrome', headless=False, **executable_path)
browser.visit(image_url)

In [37]:
# Grab the rendered markup from the browser
img_html = browser.html

# Parse it with the built-in 'html.parser' backend
soup = bs(markup=img_html, features='html.parser')

Examining the page's HTML structure with Chrome's Inspect tool shows that the featured image is stored in an `img` tag whose class attribute is `headerimage fade-in`.

In [38]:
# Read the 'src' attribute of the featured <img> tag; on this page it is a
# path relative to the site root (e.g. 'image/featured/mars3.jpg').
featured_img_path = soup.find('img', class_='headerimage fade-in')['src']

# Resolve the path against the page url. urljoin gives the same result as
# plain concatenation for a bare relative path, but also copes with a
# leading '/' or an already-absolute src instead of producing a broken url.
featured_image_url = urljoin(image_url, featured_img_path)
featured_image_url

'https://spaceimages-mars.com/image/featured/mars3.jpg'

In [39]:
# Featured image url has been captured; close the browser window opened for this section
browser.quit()

# Mars Facts

In [101]:
# Mars facts page to scrape
facts_url = 'https://galaxyfacts-mars.com/'

# Start a fresh Chrome window and load the page
browser = Browser('chrome', headless=False, **executable_path)
browser.visit(facts_url)

In [102]:
# Capture the rendered markup of the facts page as a string for pandas to parse
facts_html = browser.html

In [107]:
# Parse every <table> in the page into a DataFrame, using each table's first
# row as the column header. The markup is wrapped in StringIO because passing
# a literal HTML string to read_html is deprecated in recent pandas releases.
tables = pd.read_html(StringIO(facts_html), header=0)

# Keep the first two tables under descriptive names: the Mars/Earth
# comparison table and (presumably) the Mars profile table.
comparisons_df = tables[0]
profile_df = tables[1]

In [108]:
# Give the first column a meaningful name in place of the table caption
comparisons_df = comparisons_df.rename(
    {'Mars - Earth Comparison': 'Attribute'},
    axis='columns',
)
comparisons_df

Unnamed: 0,Attribute,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-87 to -5 °C,-88 to 58°C


In [105]:
# The facts markup is already held in facts_html, so the browser window is no longer needed
browser.quit()

In [109]:
# Render both DataFrames as HTML <table> strings carrying the
# 'table table-striped' classes
comparisons_html, profile_html = (
    frame.to_html(classes='table table-striped')
    for frame in (comparisons_df, profile_df)
)

# Mars Hemispheres

In [55]:
# Base url of the Mars hemispheres site (trailing slash included)
hemi_url = 'https://marshemispheres.com/'

# Start a fresh Chrome window and load the index page
browser = Browser('chrome', headless=False, **executable_path)
browser.visit(hemi_url)

In [56]:
# Grab the rendered markup of the hemispheres index page
hemi_html = browser.html

# Parse it with the built-in 'html.parser' backend
soup = bs(markup=hemi_html, features='html.parser')

In [71]:
# Find every anchor in the hemisphere list
items = soup.find_all('a', class_='itemLink product-item')

# Collect the unique sub-page links in the order they appear on the page
links = []
for item in items:
    # .get returns None when the anchor has no href, so no exception
    # handling is needed to skip such anchors
    link = item.get('href')

    # Skip missing hrefs and the '#' placeholder, which points nowhere
    # useful. Filtering here (rather than calling links.remove('#')
    # afterwards) means the cell does not fail when no '#' link is present.
    if link is None or link == '#':
        continue

    # Each hemisphere is linked more than once; keep only the first occurrence
    if link not in links:
        links.append(link)

links

['cerberus.html', 'schiaparelli.html', 'syrtis.html', 'valles.html']

In [98]:
# One {'title', 'img_url'} dictionary per hemisphere
hemisphere_image_urls = []

# Visit each hemisphere sub-page to read its title and full-resolution image link
for link in links:
    # Resolve the sub-page link against the site's base url
    sub_url = urljoin(hemi_url, link)

    # Navigate to the sub-page and grab its rendered markup
    browser.visit(sub_url)
    sub_html = browser.html

    # Parse into a separate name so the index-page `soup` is left intact and
    # the link-collection cell above can still be re-run after this one
    sub_soup = bs(sub_html, 'html.parser')

    # Title of the hemisphere as shown on the page
    title = sub_soup.find('h2', class_='title').text

    # The anchor whose text is exactly 'Original' points at the full-size image
    href = sub_soup.find('a', string='Original')['href']

    # Resolve the image path against the page it was found on
    img_result_url = urljoin(sub_url, href)

    # Record the result for this hemisphere
    hemisphere_image_urls.append({'title': title, 'img_url': img_result_url})

hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/cerberus_enhanced.tif'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced.tif'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced.tif'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced.tif'}]

In [99]:
# All hemisphere pages have been scraped; close the browser window
browser.quit()