In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Import scraping tools
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Set up Splinter: obtain a ChromeDriver binary via webdriver_manager,
# then launch a visible (non-headless) Chrome session driven by Splinter
executable_path = {"executable_path": ChromeDriverManager().install()}
browser = Browser("chrome", headless=False, **executable_path)



Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [/Users/jenamis/.wdm/drivers/chromedriver/mac64/100.0.4896.60/chromedriver] found in cache


### News Article

In [4]:
# Open the Mars news site in the automated browser
url = "https://redplanetscience.com/"
browser.visit(url)

# Wait up to 1 second for an article container ("div.list_text") to appear;
# the boolean result is displayed as the cell output
browser.is_element_present_by_css("div.list_text", wait_time=1)

True

In [5]:
# Parse the page currently loaded in the browser and keep the first
# "div.list_text" element, which later cells search for the title and summary
html = browser.html
news_soup = soup(html, features="html.parser")
slide_elem = news_soup.select_one("div.list_text")

In [6]:
# Scrape for most recent article title.
# This displays the whole matching <div> tag (markup included), not just its text.
slide_elem.find("div", class_="content_title")

<div class="content_title">With Mars Methane Mystery Unsolved, Curiosity Serves Scientists a New One: Oxygen</div>

In [7]:
# Within the article container, locate the first "content_title" div
# and store its text content as news_title
title_tag = slide_elem.find("div", class_="content_title")
news_title = title_tag.get_text()
news_title

'With Mars Methane Mystery Unsolved, Curiosity Serves Scientists a New One: Oxygen'

In [8]:
# Within the same article container, locate the teaser div
# and store its text content as the article summary
teaser_tag = slide_elem.find("div", class_="article_teaser_body")
news_p = teaser_tag.get_text()
news_p

'For the first time in the history of space exploration, scientists have measured the seasonal changes in the gases that fill the air directly above the surface of Gale Crater on Mars. '

### Featured Image

In [9]:
# Visit Space Images site.
# Note: this reassigns `url`, which previously held the news site address.
url = "https://spaceimages-mars.com/"
browser.visit(url)

In [10]:
# Find and click the full image button: it is the second <button>
# element on the page (index 1)
page_buttons = browser.find_by_tag("button")
full_image_elem = page_buttons[1]
full_image_elem.click()

In [11]:
# Re-read the page HTML after the click and parse it
html = browser.html
img_soup = soup(html, features="html.parser")

In [12]:
# Find the <img> tag with class "fancybox-image" and read its "src"
# attribute, which is a path relative to the site root
featured_img_tag = img_soup.find("img", class_="fancybox-image")
img_url_rel = featured_img_tag.get("src")
img_url_rel

'image/featured/mars2.jpg'

In [13]:
# Prefix the relative path with the site's base URL to get an absolute URL
img_url = "https://spaceimages-mars.com/{}".format(img_url_rel)
img_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

### Facts

In [14]:
# Scrape the first table from the Mars Facts site, label its three columns,
# and index the rows by "Description".
# Written as a single non-mutating chain (no `inplace=True`, no attribute
# assignment on an existing frame), so `df` is only ever bound to the
# finished result.
df = (
    pd.read_html("https://galaxyfacts-mars.com/")[0]
    .set_axis(["Description", "Mars", "Earth"], axis="columns")
    .set_index("Description")
)
df

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [15]:
# Convert DF to HTML.
# With no arguments, to_html() returns the markup as a string, which is
# displayed as the cell output; nothing is written to disk.
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

# D1: Scrape High-Resolution Mars Hemisphere Images and Titles

### Hemispheres

In [16]:
# Visit Mars Hemispheres site.
# `url` is reused by the scraping loop below as the prefix for image links,
# so it must keep its trailing slash.
url = "https://marshemispheres.com/"
browser.visit(url)

In [17]:
# Create list to hold one {"img_url", "title"} dictionary per hemisphere
hemisphere_image_urls = []

# Count the hemisphere links on the index page rather than assuming four
hemisphere_count = len(browser.links.find_by_partial_text("Hemisphere Enhanced"))

# Retrieve image URL and title for each hemisphere
for link_index in range(hemisphere_count):

    # Look the links up again on every pass, because the browser comes back
    # to the index page at the end of each iteration
    browser.links.find_by_partial_text("Hemisphere Enhanced")[link_index].click()

    # Parse the hemisphere detail page
    hemisphere_soup = soup(browser.html, "html.parser")

    # The anchor inside the first <li> carries the image href; it is
    # relative, so it is prefixed with the site URL when stored below
    full_image_href = hemisphere_soup.find("li").a["href"]

    # Retrieve title
    hemisphere_title = hemisphere_soup.find("h2", class_="title").text.strip()

    # Loop-specific names are used on purpose: assigning to `img_url` here
    # would overwrite the featured-image URL stored under that name earlier
    # in the notebook
    hemisphere_image_urls.append(
        {"img_url": f"{url}{full_image_href}", "title": hemisphere_title}
    )

    # Return to the index page for the next hemisphere
    browser.back()

In [18]:
# Display the list of dictionaries (one per hemisphere, each with an
# "img_url" and a "title" key) built by the loop above
hemisphere_image_urls

[{'img_url': 'https://marshemispheres.com/images/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [19]:
# End automated browsing session.
# Cells that use `browser` cannot be re-run after this without first
# re-running the Splinter setup cell.
browser.quit()