## 1.) Dependencies and Setup

In [1]:
# Import dependencies
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

In [2]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\12015\.wdm\drivers\chromedriver\win32\91.0.4472.19\chromedriver.exe] found in cache


## 2.) Scrape [mars news site](https://redplanetscience.com/) - collect latest news title and paragraph text

In [3]:
# Use splinter browser variable to navigate to the mars news site
url = 'https://redplanetscience.com/'
browser.visit(url)

In [4]:
# Scrape using BeautifulSoup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Use div with class 'content_title' to find list of all news titles
article_titles = soup.find_all('div', class_='content_title')

# Pull first one to get latest title
latest_article_title = article_titles[0].text

print(latest_article_title)

# Use div with class 'article_teaser_body' to find list of news paragraph text
article_paragraphs = soup.find_all('div', class_='article_teaser_body')

# Pull first one to get latest article paragraph
latest_article_paragraph = article_paragraphs[0].text

print(latest_article_paragraph)


Celebrate Mars Reconnaissance Orbiter's Views From Above
Marking its 15th anniversary since launch, one of the oldest spacecraft at the Red Planet has provided glimpses of dust devils, avalanches, and more.


## 3.) Scrape [JPL mars space images site](https://spaceimages-mars.com/) - collect featured image path

In [5]:
# Use splinter browser variable to navigate to the mars news site
url = 'https://spaceimages-mars.com/'
browser.visit(url)

In [6]:
# Scrape using BeautifulSoup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Use div with class 'content_title' to find list of all news titles
featured_image = soup.find('img', class_='headerimage')

# Pull first one to get latest title
featured_image_url =  url + featured_image['src']

print(featured_image_url)

https://spaceimages-mars.com/image/featured/mars2.jpg


## 4.) Scrape [mars fact site](https://galaxyfacts-mars.com/) - use pandas to scrable facts table

In [7]:
# Define url for pandas to scrape
url = 'https://galaxyfacts-mars.com/'

In [8]:
# Read in tables from url defined above
tables = pd.read_html(url)
tables

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [9]:
# Convert list returned above into dataframe
table_df = tables[0]
table_df

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [10]:
# Rename columns and reset index
table_df.columns = table_df.iloc[0]
table_df = table_df.drop(table_df.index[0])
# table_df = table_df.set_index('Mars - Earth Comparison')
table_df

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [11]:
# Output scraped dataframe to html string
table_html_string = table_df.to_html(index=False).replace('\n', '')
table_html_string

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Mars - Earth Comparison</th>      <th>Mars</th>      <th>Earth</th>    </tr>  </thead>  <tbody>    <tr>      <th></th>      <td>Diameter:</td>      <td>6,779 km</td>      <td>12,742 km</td>    </tr>    <tr>      <th></th>      <td>Mass:</td>      <td>6.39 × 10^23 kg</td>      <td>5.97 × 10^24 kg</td>    </tr>    <tr>      <th></th>      <td>Moons:</td>      <td>2</td>      <td>1</td>    </tr>    <tr>      <th></th>      <td>Distance from Sun:</td>      <td>227,943,824 km</td>      <td>149,598,262 km</td>    </tr>    <tr>      <th></th>      <td>Length of Year:</td>      <td>687 Earth days</td>      <td>365.24 days</td>    </tr>    <tr>      <th></th>      <td>Temperature:</td>      <td>-87 to -5 °C</td>      <td>-88 to 58°C</td>    </tr>  </tbody></table>'

In [12]:
# Validate html string by exporting to html file and opening in browser
table_df.to_html('table.html', index=False)

## 4.) Scrape [mars hemispheres site](https://marshemispheres.com/) - scrape all full resolution image paths

In [13]:
# Use splinter browser variable to navigate to the mars news site
url = 'https://marshemispheres.com/'
browser.visit(url)

In [14]:
# Scrape using BeautifulSoup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Create empty list to store hemisphere image urls
hemisphere_image_urls = []

# USE BELOW FOR CLICKING - BELOW WORKS
image_links = browser.find_by_css('a.product-item h3')

for i in range(0,len(image_links)-1):
    # Reset variable to avoid "StaleReferenceException"
    # Approach adapted from https://www.selenium.dev/exceptions/#stale_element_reference
    image_to_click = browser.find_by_css('a.product-item h3')[i]
    # Pull title and remove 'Enhanced' from title name
    title = image_to_click.text
    title_final = title.replace(" Enhanced","")
    # Use splinter to navigate to the a tag destination, then scrape for the full resolution image path
    image_to_click.click()
    # Now that we are on new page, reset html and soup variables
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Scrape the full resolution image path
    image_path = soup.find('img',class_='wide-image')['src']
    # Put title and image path in dictionary, and add to hemisphere_image_urls list created above
    image_url_dict = {'title':title_final,'img_url':url + image_path}
    hemisphere_image_urls.append(image_url_dict)
    # Navigate back to original page to scrape the next image
    browser.links.find_by_partial_text('Back').click()
    time.sleep(1)

# Ensure that output dictionary was created appropriately
print(hemisphere_image_urls)




[{'title': 'Cerberus Hemisphere', 'img_url': 'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'}, {'title': 'Schiaparelli Hemisphere', 'img_url': 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'}, {'title': 'Syrtis Major Hemisphere', 'img_url': 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'}, {'title': 'Valles Marineris Hemisphere', 'img_url': 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]
