In [28]:
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
import time

In [2]:
def init_browser():
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", **executable_path, headless=False)

In [72]:
def scrape(url):
    browser = init_browser()

    url = url #"https://mars.nasa.gov/news/"
    browser.visit(url)
    
    time.sleep(5)

    html = browser.html
    browser.quit()
    soup = BeautifulSoup(html, "html.parser")
    
    return soup

## Mars News Scrape

In [164]:
url = 'https://mars.nasa.gov/news/'

In [165]:
soup = scrape(url)

In [178]:
news = soup.find_all('div',class_='content_title')

# Get the first story that has a link 
for story in news:
    if story.find('a'):
        firstNews = story.find('a').get_text()
        break
        
teaser = soup.find('div',class_='article_teaser_body').get_text()
firstNews, teaser

("NASA's Curiosity Mars Rover Snaps Its Highest-Resolution Panorama Yet",
 'To go along with the stunning 1.8-billion-pixel image, a new video offers a sweeping view of the Red Planet.')

## JPL Image URL

In [50]:
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

In [51]:
soup = scrape(url)

In [55]:
imgLinkSuffix = soup.find("a", class_="button fancybox")['data-fancybox-href']

In [56]:
featuredImgURL = 'https://www.jpl.nasa.gov' + imgLinkSuffix

In [57]:
featuredImgURL

'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA17794_ip.jpg'

## Mars Weather Tweets

In [179]:
url = 'https://twitter.com/marswxreport?lang=en'

In [180]:
soup = scrape(url)

In [181]:
tweetHolder = soup.find_all('article', role='article')

In [182]:
# The fifth elemenet of the list of spans should always be the tweet text of the latest tweet
#
# Also we need to make sure it's a tweet about weather, which isn't a guarantee.
# Weather tweets seem to start with "InSight" so we check the first element of the text
# If it's indeed 'InSight', we set the text to our variable and break the for loop of tweets

for tweet in tweetHolder:   
    if (tweet.find_all('span')[4].get_text().split(" ")[0] == 'InSight'):
        tweetText = tweet.find_all('span')[4].get_text()
        break

In [184]:
tweetText = tweetText.replace('\n', ", ")

In [185]:
tweetText

'InSight sol 451 (2020-03-03) low -93.6ºC (-136.4ºF) high -10.3ºC (13.4ºF), winds from the SSE at 6.3 m/s (14.2 mph) gusting to 19.9 m/s (44.5 mph), pressure at 6.30 hPa'

## Pandas Scraping

In [142]:
url = 'https://space-facts.com/mars/'

In [143]:
tables = pd.read_html(url)
df = tables[0]

In [151]:
df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [161]:
df = df.rename(columns={0: 'Description', 1:'Value'})\
                .set_index('Description')
del df.index.name

htmlTable = df.to_html()

In [163]:
htmlTable

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Value</th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-87 to -5 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

## Mars Hemispheres

In [117]:
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [139]:
def click_around(url):
    
    # Create empty list
    hemisphere_image_urls = []
    
    browser = init_browser()

    url = url #"https://mars.nasa.gov/news/"
    browser.visit(url)
    
    time.sleep(5)
    
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    
    for hemElem in soup.find_all('h3'):
        
        hemDict = {}
        
        hemName = hemElem.get_text().rstrip('Enhanced')

        browser.click_link_by_partial_text(hemName)
        
        time.sleep(1)
        
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")
        
        downloads = soup.find('div',class_='downloads')
        imgURL = downloads.find('a')['href']
        
        hemDict['title'] = hemName 
        hemDict['img_url'] = imgURL
        
        hemisphere_image_urls.append(hemDict)
        
        browser.back()
        
        time.sleep(1)
    
    return hemisphere_image_urls
    
    #soup = BeautifulSoup(html, "html.parser")
    
    browser.quit()

In [140]:
hemisphere_image_urls = click_around(url)



In [141]:
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere ',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere ',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere ',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere ',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]