In [None]:
# ------------------------------------------------------------
#  Step 1: Import all required modules and initialize all tools
# ------------------------------------------------------------
from bs4 import BeautifulSoup as soup
from splinter import Browser
import pandas as pd

# start a visible (non-headless) Chrome session through splinter,
# driven by the chromedriver binary at the path below
browser = Browser(
    'chrome',
    executable_path="/usr/local/bin/chromedriver",
    headless=False,
)

In [None]:
# ------------------------------------------------------------
#  Step 2: Scrape the NASA Mars News website for recent
#  headlines: titles, dates, and content previews
# ------------------------------------------------------------

# load the news listing in the browser and parse the page source it returns
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
browser.visit(url)
html = browser.html
webpage = soup(html, 'html.parser')

# Each list is built directly from its matches instead of appending inside a
# throwaway comprehension, so the cell no longer ends on an expression that
# evaluates to (and displays) a list of None values.

# pull the most recent headlines from the website
# NOTE(review): class_=None is kept from the original query; presumably it
# limits the match to <h3> tags without a class attribute -- confirm
headlines_grouped = webpage.find_all('h3', class_=None)
headline_list = [headline.get_text() for headline in headlines_grouped]

# pull the dates for the most recent headlines
dates_grouped = webpage.find_all('div', class_='list_date')
date_list = [date.get_text() for date in dates_grouped]

# pull the teaser text of the most recent headlines
text_grouped = webpage.find_all('div', class_='article_teaser_body')
text_list = [text.get_text() for text in text_grouped]

In [45]:
# ------------------------------------------------------------
#  Step 3: Scrape the JPL Space Images site for the featured
#  Mars image
# ------------------------------------------------------------
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)
html = browser.html
webpage = soup(html, 'html.parser')

# get title and description
featured_title = webpage.find('h1', class_='media_feature_title').get_text()

# the same anchor carries both the description and the image path,
# so it is looked up once and reused
featured_link = webpage.find('a', class_='button fancybox')
featured_description = featured_link.get('data-description')

# get and construct url for largest size of featured image available
featured_href = featured_link.get('data-fancybox-href')
# the image id is the part of the file name before the first underscore;
# taking the last path segment avoids depending on a fixed path depth
featured_filename = featured_href.rsplit('/', 1)[-1].split('_')[0]
featured_url = f'https://www.jpl.nasa.gov/spaceimages/images/largesize/{featured_filename}_hires.jpg'

In [102]:
# ------------------------------------------------------------
#  Step 4: Scrape the Mars weather Twitter account for the
#  most recent weather update
# ------------------------------------------------------------
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)
html = browser.html
webpage = soup(html, 'html.parser')

std_tweet_class = 'TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'

# pull text of the first tweet paragraph on the page
recent_weather = webpage.find_all('p', class_=std_tweet_class)[0].get_text()

# NOTE(review): the parsing below is purely positional and assumes the tweet
# reads like
#   "Sol N (Month D, YYYY), <weather>, high <t>, low <t>, pressure at <p> <unit>, daylight <range>"
# -- confirm against the live feed; any other layout raises IndexError or
# yields the wrong fields.

# Split on commas, then on single spaces. The comma inside the earth date
# produces two leading fragments, which [2:] drops. Each remaining fragment
# starts with a space, so index 0 of every token list is an empty string.
recent_weather_split = [fragment.split(' ') for fragment in recent_weather.split(',')][2:]

# text before "(" is the sol; text between "(" and ")" is the earth date
date_parts = recent_weather.split('(')

# create dictionary to turn into a dataframe
# (all values are already strings, so no f-string wrapping is needed)
weather_dict = {'mars_date': date_parts[0].strip(),  # drop the space left before "("
                'earth_date': date_parts[1].split(')')[0],
                'cur_weather': recent_weather_split[0][1],
                'temp_high': recent_weather_split[1][2],
                'temp_low': recent_weather_split[2][2],
                'pressure': f'{recent_weather_split[3][3]} {recent_weather_split[3][4]}',
                'daylight': recent_weather_split[4][2]}

# one row per field, with a single descriptively named value column
weather_df = (
    pd.DataFrame
    .from_dict(weather_dict, orient='index')
    .rename(columns={0: 'Most Recent Weather on Mars'})
)
weather_df

Unnamed: 0,Most Recent Weather on Mars
mars_date,Sol 2029
earth_date,"April 21, 2018"
cur_weather,Sunny
temp_high,-11C/12F
temp_low,-72C/-97F
pressure,7.22 hPa
daylight,05:25-17:21


In [49]:
# Debugging aid: dump the indented markup of whatever document `webpage`
# currently holds. The output is very long; consider clearing it before sharing.
pretty_html = webpage.prettify()
print(pretty_html)

<!DOCTYPE html>
<html data-scribe-reduced-action-queue="true" lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta charset="utf-8"/>
  <script nonce="">
   !function(){window.initErrorstack||(window.initErrorstack=[]),window.onerror=function(r,i,n,o,t){r.indexOf("Script error.")&gt;-1||window.initErrorstack.push({errorMsg:r,url:i,lineNumber:n,column:o,errorObj:t})}}();
  </script>
  <script id="bouncer_terminate_iframe" nonce="">
   if (window.top != window) {
  window.top.postMessage({'bouncer': true, 'event': 'complete'}, '*');
}
  </script>
  <script id="ttft_boot_data" nonce="">
   window.ttftData={"transaction_id":"00a43f32009e218b.36be5eac9f210166\u003c:00526fbe00d1b0b1","server_request_start_time":1524550832357,"user_id":null,"is_ssl":true,"rendered_on_server":true,"is_tfe":true,"client":"macaw-swift","tfe_version":"tsa_a\/1.0.1\/20180327.1623.34ab2f4","ttft_browser":"chrome"};!function(){function t(t,n){window.ttftData&amp;&amp;!window.ttftData[t]&amp;&amp;(window.tt