In [1]:
# Dependencies
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
executable_path = {"executable_path": ChromeDriverManager().install()}
browser = Browser("chrome", **executable_path, headless=False)



Current google-chrome version is 93.0.4577
Get LATEST driver version for 93.0.4577
Get LATEST driver version for 93.0.4577
Trying to download new driver from https://chromedriver.storage.googleapis.com/93.0.4577.63/chromedriver_mac64.zip
Driver has been saved in cache [/Users/seong-minkim/.wdm/drivers/chromedriver/mac64/93.0.4577.63]


## NASA Mars News

Scraping the title and paragraph text of the latest news

In [3]:
# URL of page to be scraped
url = "https://redplanetscience.com/"
browser.visit(url)
html = browser.html

In [4]:
# Create BeautifulSoup object; parse with 'html.parser'
soup = bs(html, 'html.parser')

In [5]:
# The latest news title is the first title
news_title = soup.find('div', class_="content_title").get_text()
news_title

'Space History Is Made in This NASA Robot Factory'

In [6]:
# The paragraph text of the latest news
news_p = soup.find('div', class_="article_teaser_body").get_text()
news_p

"From rockets to rovers, JPL's Spacecraft Assembly Facility has been at the center of robotic spaceflight. Here's a closer look at what makes it so special."

## JPL Mars Space Images - Featured Image

In [7]:
# URL of page to be scraped
url_images = "https://spaceimages-mars.com/"
browser.visit(url_images)
html_images = browser.html

In [8]:
# Create BeautifulSoup object; parse with 'html.parser'
soup_images = bs(html_images, 'html.parser')

In [9]:
# Get URL for the featured image
result = soup_images.find_all("img", class_="headerimage fade-in")

for r in result:
    src = r.get("src")
    featured_image_url = f"{url_images}{src}"
    print(featured_image_url)

https://spaceimages-mars.com/image/featured/mars3.jpg


## Mars Facts

Use Pandas to scrape the table containing facts about the planet

In [10]:
# URL of page to be scraped
url_facts = "https://galaxyfacts-mars.com/"
tables = pd.read_html(url_facts)
tables

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [11]:
# Build dataframe around the information about the planet's diameter, mass, etc. So we don't need table[0]
facts_df = tables[1]

# Set column names
facts_df.columns = [" ", "Facts"]

# Reset the index
facts_df.set_index(" ", inplace=True)

# Show Dataframe
facts_df

Unnamed: 0,Facts
,
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 ( Phobos & Deimos )
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [12]:
facts_html = facts_df.to_html()
facts_html = facts_html.replace('\n', '')

In [13]:
facts_html

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Facts</th>    </tr>    <tr>      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 ( Phobos &amp; Deimos )</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-87 to -5 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

## Mars Hemispheres

In [14]:
# URL of page to be scraped
url_hemi = "https://marshemispheres.com/"
browser.visit(url_hemi)
html_hemi = browser.html

In [15]:
# Create BeautifulSoup object; parse with 'html.parser'
soup_hemi = bs(html_hemi, 'html.parser')

In [39]:
# Get results for title of each hemisphere
results_hemi = soup_hemi.find_all('h3')
results_tag = soup_hemi.find_all('div', class_="description")

# Iterate through the results_hemi to extract the title and url of each hemisphere
hemisphere_image_urls = []

for x in range(4):
    
    # Title
    img_title = results_hemi[x].text
    
    # Images
    browser.visit(f"{url_hemi}{results_tag[x].a['href']}")
    html_x = browser.html
    soup_tag = bs(html_x, "html.parser")
    tag = soup_tag.find('img', class_="wide-image").get("src")
    img_url = f"{url_hemi}{tag}"
    
    # Build the dictionary
#    post = {"title": img_title, "img_url": img_url}
#    hemisphere_image_urls.append(post)
    post = {}
    post['title'] = img_title
    post['img_url'] = img_url
    hemisphere_image_urls.append(post)
    
print(hemisphere_image_urls)

AttributeError: 'dict' object has no attribute 'append'

In [38]:
hemisphere_image_urls.dtypes()

AttributeError: 'list' object has no attribute 'dtypes'