In [76]:
# ------------------------------------------------------------
#  Step 1: Import all required modules and initialize all tools
# ------------------------------------------------------------
from bs4 import BeautifulSoup as soup
from splinter import Browser
import pandas as pd
import aux_func as aux

# initialize splinter browser
browser = Browser('chrome', 
                  **{"executable_path": "/usr/local/bin/chromedriver"}, 
                  headless=False)

In [73]:
# ------------------------------------------------------------
#  Step 2: Scrape Nasa Mars News website for recent headlines
#  with headlines, dates, and content preview
# ------------------------------------------------------------
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
webpage = aux.getParsedWebpage(browser, url)

# create lists to hold info
headline_list = []
date_list = []
text_list = []

# pull the most recent headlines from the website
headlines_grouped = soup.find_all(webpage, 'h3', class_=None)
[headline_list.append(headline.get_text()) for headline in headlines_grouped]

# pull the dates for the most recent headlines
dates_grouped = soup.find_all(webpage, 'div', class_='list_date')
[date_list.append(date.get_text()) for date in dates_grouped]

# pull the text of the most recent headlines
text_grouped = soup.find_all(webpage, 'div', class_='article_teaser_body')
[text_list.append(text.get_text()) for text in text_grouped]

[]

In [3]:
# ------------------------------------------------------------
#  Step 3: Scrape Nasa Mars News website for featured image
# ------------------------------------------------------------
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
webpage = aux.getParsedWebpage(browser, url)

# get title and description
featured_title = soup.find(webpage, 'h1', class_='media_feature_title').get_text()
featured_description = soup.find(webpage, 'a', class_='button fancybox').get('data-description')

# get and construct url for largest size of featured image available
featured_url = soup.find(webpage, 'a', class_='button fancybox').get('data-fancybox-href')
featured_filename = featured_url.split('/')[4].split('_')[0]
featured_url = f'https://www.jpl.nasa.gov/spaceimages/images/largesize/{featured_filename}_hires.jpg'

In [4]:
# ------------------------------------------------------------
#  Step 4: Scrape Mars Twitter for the most recent weather 
#  update
# ------------------------------------------------------------
url = 'https://twitter.com/marswxreport?lang=en'
webpage = aux.getParsedWebpage(browser, url)

std_tweet_class = 'TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'

# pull text of most recent tweet about the weather
recent_weather = soup.find_all(webpage, 'p', class_= std_tweet_class)[0].get_text()

# create split string to pull apart and add to dataframe
recent_weather_split = recent_weather.split(',')
recent_weather_split = [i.split(' ') for i in recent_weather_split][2:]

# create dictionary to turn into a dataframe
weather_dict = {'mars_date':f'{recent_weather.split("(")[0]}',
                'earth_date':f'{recent_weather.split("(")[1].split(")")[0]}',
                'cur_weather':f'{recent_weather_split[0][1]}',
                'temp_high':f'{recent_weather_split[1][2]}',
                'temp_low':f'{recent_weather_split[2][2]}',
                'pressure':f'{recent_weather_split[3][3]} {recent_weather_split[3][4]}',
                'daylight':f'{recent_weather_split[4][2]}'}
weather_df = pd.DataFrame.from_dict(weather_dict, orient='index')
weather_df = weather_df.rename(columns={0:'Most Recent Weather on Mars'})
weather_df

Unnamed: 0,Most Recent Weather on Mars
mars_date,Sol 2029
earth_date,"April 21, 2018"
cur_weather,Sunny
temp_high,-11C/12F
temp_low,-72C/-97F
pressure,7.22 hPa
daylight,05:25-17:21


In [35]:
# ------------------------------------------------------------
#  Step 5: Scrape Space Facts website for data on Mars
# ------------------------------------------------------------
url = 'https://space-facts.com/mars/'
webpage = aux.getParsedWebpage(browser, url)


# create dict to hold facts
fact_dict = {}

# get all rows in the facts table and parse into dict
facts_all = soup.find(webpage, 
                      'table', 
                      class_='tablepress tablepress-id-mars').find_all('tr')
for fact in facts_all:
    fact_dict[soup.find(fact, 'strong').get_text()] = (soup.find(fact, class_='column-2').get_text())

# convert to Dataframe and to HTML table
fact_df = pd.DataFrame.from_dict(fact_dict, orient='index')
fact_df.rename(columns={0:'Facts about Mars'}, inplace=True)
fact_html = pd.DataFrame.to_html(fact_df)

fact_df

Unnamed: 0,Facts about Mars
Equatorial Diameter:,"6,792 km\n"
Polar Diameter:,"6,752 km\n"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)\n
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [78]:
# ------------------------------------------------------------
#  Step 6: Scrape images and titles from Astrogeology site
# ------------------------------------------------------------
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
webpage = aux.getParsedWebpage(browser, url)

# store the base link for the page
base_url = 'https://astrogeology.usgs.gov/'

# get all the links to the photo pages first
page_links_list = []
page_links = soup.find_all(webpage, 'a', class_='itemLink product-item')
[page_links_list.append(page.get('href'))for page in page_links]

# remove duplicates and change back to list
page_links_list = set(page_links_list)
page_links_list = list(page_links_list)

# iterate through links and pull URL for full size images
#for link in
print(page_links_list)

['/search/map/Mars/Viking/schiaparelli_enhanced', '/search/map/Mars/Viking/valles_marineris_enhanced', '/search/map/Mars/Viking/syrtis_major_enhanced', '/search/map/Mars/Viking/cerberus_enhanced']


In [37]:
print(webpage.prettify())

<!DOCTYPE html>
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <link href="//ajax.googleapis.com/ajax/libs/jqueryui/1.11.3/themes/smoothness/jquery-ui.css" rel="stylesheet" type="text/css"/>
  <script async="" src="https://ssl.google-analytics.com/ga.js" type="text/javascript">
  </script>
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js" type="text/javascript">
  </script>
  <title>
   Astropedia Search Results | USGS Astrogeology Science Center
  </title>
  <meta content="USGS Astrogeology Science Center Astropedia search results." name="description"/>
  <meta content="USGS,Astrogeology Science Center,Cartography,Geology,Space,Geological Survey,Mapping" name="keywords"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <meta content="x61hXXVj7wtfBSNOPnTftajMsZ5yB2W-qRo