In [90]:
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# from requests_html import HTMLSession, AsyncHTMLSession

In [119]:
# Configure a headless, incognito Chrome session for Selenium
options = webdriver.ChromeOptions()
for chrome_flag in ('--ignore-certificate-errors', '--incognito', '--headless'):
    options.add_argument(chrome_flag)

# NOTE(review): the chromedriver path is machine-specific — adjust if it lives elsewhere
driver = webdriver.Chrome("/usr/local/bin/chromedriver", options=options)

In [101]:
# 1. Mars NASA News
# Target URL and a fake User-Agent header (Selenium works without the header)
nasa_url = 'https://mars.nasa.gov/news/'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.100 Safari/537.36'
    )
}

In [102]:
# The news page loads its data dynamically (React.js), so it is opened in
# Selenium first; after a short pause the rendered HTML is handed to Beautiful Soup.
driver.get(nasa_url)
time.sleep(1)

page_source = driver.page_source
soup = BeautifulSoup(markup=page_source, features='lxml')

In [104]:
# Structure of each news entry on the page:
# ul.item_list > li.slide > div.image_and_description_container
#   div.list_image > img[src]          -> thumbnail image link
#   div.list_text
#     div.list_date                    -> posted date
#     div.content_title > a            -> news title (text) and news link (href)
#     div.article_teaser_body          -> teaser paragraph

css_selector = 'ul.item_list > li.slide > div.image_and_description_container'

news_list = soup.select(css_selector)

news_info_list = []
base_url = 'https://mars.nasa.gov'


def _tag_text(tag):
    """Return the whitespace-stripped text of ``tag``, or '' if ``tag`` is None."""
    # get_text() also works when the tag wraps nested markup, where .string is None
    return tag.get_text(' ', strip=True) if tag is not None else ''


def _absolute_url(base, tag, attribute):
    """Resolve ``tag[attribute]`` against ``base``; '' if the tag or attribute is missing."""
    raw = tag.get(attribute) if tag is not None else None
    # urljoin copes with root-relative, relative and already-absolute links alike
    return urljoin(base, raw.strip()) if raw else ''


for news in news_list:
    news_text = news.select_one('div.list_text')
    if news_text is None:
        # Nothing to extract from an entry without its text container
        continue

    title_anchor = news_text.select_one('div.content_title > a')
    news_img = news.select_one('div.list_image')
    thumbnail = news_img.select_one('img') if news_img is not None else None

    news_info_list.append({
        "post_date": _tag_text(news_text.select_one('div.list_date')),
        "title": _tag_text(title_anchor),
        "paragraph": _tag_text(news_text.select_one('div.article_teaser_body')),
        "news_link": _absolute_url(base_url, title_anchor, 'href'),
        "news_image": _absolute_url(base_url, thumbnail, 'src'),
    })

In [106]:
# Preview the first five scraped news entries
news_info_list[:5]

[{'post_date': 'August 15, 2019',
  'title': "Robotic Toolkit Added to NASA's Mars 2020 Rover",
  'paragraph': "The bit carousel, which lies at the heart of the rover's Sample Caching System, is now aboard NASA's newest rover.",
  'news_link': 'https://mars.nasa.gov/news/8503/robotic-toolkit-added-to-nasas-mars-2020-rover/',
  'news_image': 'https://mars.nasa.gov/system/news_items/list_view_images/8503_PIA23319-MAIN-226.jpg'},
 {'post_date': 'August 13, 2019',
  'title': "Space Samples Link NASA's Apollo 11 and Mars 2020",
  'paragraph': "While separated by half a century, NASA's Apollo 11 and Mars 2020 missions share the same historic goal: returning samples to Earth.",
  'news_link': 'https://mars.nasa.gov/news/8502/space-samples-link-nasas-apollo-11-and-mars-2020/',
  'news_image': 'https://mars.nasa.gov/system/news_items/list_view_images/8502_apollo-mars2020-20190809-th.jpg'},
 {'post_date': 'August  9, 2019',
  'title': 'Small Satellite Mission of the Year',
  'paragraph': 'The fi

In [71]:
# 2. Mars JPL
# Click through from the Mars image gallery to the featured image's detail
# page, pausing after each step so the next page has time to load.
jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
driver.get(jpl_url)
time.sleep(2)
driver.find_element_by_id('full_image').click()
time.sleep(2)
driver.find_element_by_partial_link_text('more info').click()
time.sleep(2)

# Look the main_image element up once and read both attributes from it
main_image = driver.find_element_by_class_name('main_image')
featured_image_url = main_image.get_attribute('src')
featured_image_desc = main_image.get_attribute('title')

featured_image = {"image_url": featured_image_url,
                  "description": featured_image_desc}

featured_image

{'image_url': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA00271_hires.jpg',
 'description': "The northern hemisphere is displayed in this global view of the surface of Venus as seen by NASA's Magellan spacecraft. "}

In [117]:
# 3. Mars Twitter (weather report)

# For practice purposes this cell uses soup.select() with CSS selectors, not find() or find_all().

# 1) Get the weather information if a paragraph contains "sol ###":
# div#timeline > div.stream-container > div.stream > ol#stream-items-id >
# li.js-stream-item stream-item stream-item >
# div.tweet js-stream-tweet js-actionable-tweet js-profile-popup-actionable dismissible-content original-tweet js-original-tweet has-cards has-content
# > div.content > div.js-tweet-text-container > p.string

# 2) Get the dashboard image URL as well as the tweeted date and time:
#  2-1) image
# .... > div.content > div.AdaptiveMediaOuterContainer > div.AdaptiveMedia is-square >
# div.AdaptiveMedia-container > div.AdaptiveMedia-singlePhoto >
# div.AdaptiveMedia-photoContainer js-adaptive-photo > img.get('src')
#  2-2) date and time
# .... > div.content > div.stream-item-header > small.time > a >
# span._timestamp.js-short-timestamp

tweeter_url = 'https://twitter.com/marswxreport?lang=en'
# A timeout keeps the cell from hanging forever; raise_for_status() stops us
# from parsing an HTTP error page as if it were the timeline.
resp = requests.get(tweeter_url, headers=headers, timeout=10)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'lxml')

# Use the :has() pseudo-class to keep only tweet bodies that contain a paragraph
ordered_list = soup.select('div.content:has(p)')  # a list

# Defaults, so the summary dict is well defined even when no tweet matches
# (and never silently reuses values left over from an earlier run).
mars_weather = ''
image_url = ''
tweet_date = ''

sol_pattern = re.compile(r'sol \d{3}')

for list_ in ordered_list:

    a_list_of_string = list(list_.p.strings)

    # Skip empty paragraphs and tweets that are not a weather report
    if not a_list_of_string or not sol_pattern.search(a_list_of_string[0]):
        continue

    mars_weather = a_list_of_string[0]

    # Tweet date: the short timestamp element inside the tweet header
    timestamp = list_.select_one('[class*=js-short-timestamp]')
    tweet_date = timestamp.string if timestamp is not None else ''

    # Image URL: present only when the tweet carries a single photo
    temp_html = list_.select_one('[class~=AdaptiveMedia-singlePhoto]')
    photo_img = temp_html.img if temp_html is not None else None
    image_src = photo_img.get('src') if photo_img is not None else None
    image_url = image_src.strip() if image_src else ''

    # Only the first matching tweet in the parsed list is used
    break

tweeter_output_dict = {"weather": mars_weather,
                       "image": image_url,
                       "tweet_at": tweet_date}


In [118]:
# Show the weather text, image URL and tweet date collected above
tweeter_output_dict

{'weather': 'InSight sol 258 (2019-08-18) low -100.0ºC (-148.1ºF) high -26.2ºC (-15.2ºF)\nwinds from the SSE at 5.3 m/s (11.9 mph) gusting to 16.8 m/s (37.6 mph)\npressure at 7.60 hPa',
 'image': 'https://pbs.twimg.com/media/ECU86NPWsAA95lS.jpg',
 'tweet_at': 'Aug 19'}

In [83]:
# 4. Mars Facts
# pd.read_html parses the page's tables; the result is indexed per table below
facts_url = 'https://space-facts.com/mars/'
df_facts = pd.read_html(io=facts_url)

In [95]:
# Left-align column headers when DataFrames are displayed
pd.options.display.colheader_justify = 'left'

In [96]:
# First table parsed from the page (the Mars - Earth comparison)
df_facts[0]

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-153 to 20 °C,-88 to 58°C


In [79]:
# Keep only the second parsed table and give its two columns readable names.
# The isinstance guard makes the cell safe to re-run: once df_facts has been
# rebound to the DataFrame, indexing it with [1] again would raise a KeyError.
if isinstance(df_facts, list):
    df_facts = df_facts[1].rename(columns={0: 'Fact', 1: 'Value'})

In [80]:
# Show the Mars facts table with its renamed columns
df_facts

Unnamed: 0,Fact,Value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [108]:
# df_facts is the renamed facts DataFrame at this point (no longer the list
# returned by read_html), so call to_html() on it directly; df_facts[0] would
# raise KeyError when the notebook is run top to bottom.
type(df_facts.to_html())

str

In [120]:
# 5. Mars Hemispheres
# Open the USGS Astrogeology search results for the enhanced Mars hemisphere images
hemisphere_url = (
    'https://astrogeology.usgs.gov/search/results'
    '?q=hemisphere+enhanced&k1=target&v1=Mars'
)
base_url = 'https://astrogeology.usgs.gov'  # to get the full image_url
driver.get(hemisphere_url)

In [121]:
# Collect (detail-page URL, title) pairs before navigating, because the loop
# below leaves the search-results page these link elements belong to.
hemispheres = [(link.get_attribute('href'), link.text)
               for link in driver.find_elements_by_partial_link_text('Hemisphere Enhanced')]
hemisphere_image_urls = []

try:
    for link, title in hemispheres:
        driver.get(link)
        time.sleep(2)
        # The src attribute is stored as returned by Selenium, without prefixing base_url
        img_url = driver.find_element_by_class_name('wide-image').get_attribute('src')

        hemisphere_image_urls.append({"title": title,
                                      "img_url": img_url})
finally:
    # Always shut the browser down, even if a page fails to load or parse,
    # so no headless Chrome process is left running.
    driver.quit()

In [122]:
# Show the collected hemisphere titles and image URLs
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]