In [1]:
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd
from datetime import datetime
import config as cfg
import tweepy as tw
import json

In [2]:
# Constants
mars_sites = [{"Name":"NASA Mars Explorer News",
               "URL":"https://mars.nasa.gov/news",
               "Type":"News",
               "Link Stem":"https://mars.nasa.gov"
              },
              {"Name":"JPL Mars Images",
               "URL":"https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars",
               "Type":"Featured Image",
               "Link Stem":"https://www.jpl.nasa.gov"
              },
              {"Name":"Mars Weather",
               "URL":"https://twitter.com/marswxreport?lang=en",
               "Type":"Weather",
               "Account":"MarsWxReport"
              },
              {"Name":"Mars Facts",
               "URL":"https://space-facts.com/mars/",
               "Type":"Facts"
              },
              {"Name":"Mars Hemispheres",
               "URL":"https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars",
               "Type":"Hemispheres",
               "Link Stem":"https://astrogeology.usgs.gov"
              }]

In [3]:
def get_page(url):
    executable_path = {"executable_path": cfg.chromedriver_path}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    browser.quit()
    return soup

In [6]:
def scrape_news(site):
    print("News Article")
    print(site["Name"])
    print("____________")
    executable_path = {"executable_path": cfg.chromedriver_path}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(site['URL'])
    html = browser.html
    news_soup = BeautifulSoup(html, "html.parser")
    browser.quit()
    #news_soup = get_page(site['URL'])
    latest_date = datetime(2000, 1, 1, 0, 0)
    article_url = ""
    article_title = ""
    article_description = ""
    articles = news_soup.find_all("div",class_ = "list_text")
    for article in articles:
        date_text = article.find("div",class_="list_date").text
        article_date = datetime.strptime(date_text,'%B %d, %Y')
        if article_date > latest_date:
            latest_date = article_date
            print(article_date)
            article_link = article.find("div",class_="content_title")
            article_url = f"{site['Link Stem']}{article_link.a['href']}"
            print(article_url)
            article_title = article_link.a.text.replace('\n','').strip()
            print(article_title)
            description = article.find("div", class_ = "article_teaser_body")
            article_description = description.text.strip()
            print(article_description)
    print("____________")
    detail = {}
    detail["Detail"] = article_title
    detail["Detail URL"] = article_url
    detail["Detail Description"] = article_description
    return detail

# scrape_news test
print(scrape_news(mars_sites[0]))

News Article
NASA Mars Explorer News
____________
2019-11-15 00:00:00
https://mars.nasa.gov/news/8551/mars-scientists-investigate-ancient-life-in-australia/
Mars Scientists Investigate Ancient Life in Australia
Teams with NASA's Mars 2020 and ESA's ExoMars practiced hunting for fossilized microbial life in the Australian Outback in preparation for their Red Planet missions.
____________
{'Detail': 'Mars Scientists Investigate Ancient Life in Australia', 'Detail URL': 'https://mars.nasa.gov/news/8551/mars-scientists-investigate-ancient-life-in-australia/', 'Detail Description': "Teams with NASA's Mars 2020 and ESA's ExoMars practiced hunting for fossilized microbial life in the Australian Outback in preparation for their Red Planet missions."}


In [30]:
def scrape_featured_image(site):
    print("Featured Image")
    print("____________")
    featured_soup = get_page(site['URL'])
    article = {}
    image = featured_soup.find("article",class_="carousel_item")
    print(image["alt"])
    image_style = image["style"]
    image_link = image_style[image_style.find("'")+1:]
    image_link = image_link[:image_link.find("'")]
    print(image_link)
    article["Detail"] = image["alt"]
    article["Detail URL"] = f"{site['Link Stem']}{image_link}"
    article["Detail Description"] = "Jet Propulsion Laboratory Featured Image"
    return article

# scrape_featured_image test
print(scrape_featured_image(mars_sites[1]))

Featured Image
____________
Cassiopeia A: Death Becomes Her
/spaceimages/images/wallpaper/PIA03519-1920x1200.jpg
{'Detail': 'Cassiopeia A: Death Becomes Her', 'Detail URL': 'https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA03519-1920x1200.jpg', 'Detail Description': 'Jet Propulsion Laboratory Featured Image'}


In [11]:
def get_Twitter_API():
    consumer_key = cfg.Twitter_Consumer_API_Key
    consumer_secret = cfg.Twitter_Consumer_Secret_API_Key
    access_token = cfg.Twitter_Access_Token
    access_token_secret = cfg.Twitter_Access_Token_Secret
    auth = tw.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret) 
    api = tw.API(auth)
    return api

In [29]:
def scrape_weather(site):
    print("Mars Weather")
    print("____________")
    # Use API to get latest Mars Weather tweet
    api = get_Twitter_API()
    twitter_account = site["Account"]
    status = api.user_timeline(twitter_account,count=1,page=1)
    #json_str = json.dumps(status[0]._json)
    #parsed = json.loads(json_str)
    #print(json.dumps(parsed, indent=4, sort_keys=True))
    entities = status[0].entities
    urls = dict(entities["urls"][0])
    last_tweet_url = ""
    for key in urls.keys():
        if key == "expanded_url":
            last_tweet_url = urls[key]
    print(last_tweet_url)
    # Scrape text of latest tweet
    weather_soup = get_page(last_tweet_url)
    tweet = weather_soup.find("div",class_="js-tweet-text-container")
    tweet_text = tweet.p.text.replace('\n','').strip()
    tweet_text = tweet_text[:(tweet_text.find("hPapic.twitter.com")-1)]
    print(tweet_text)
    article = {}
    article["Detail"] = tweet_text
    article["Detail URL"] = last_tweet_url
    article["Detail Description"] = "Mars Weather Tweet"
    return article

# scrape_weather test
print(scrape_weather(mars_sites[2]))

Mars Weather
____________
https://twitter.com/i/web/status/1196244436252078080
InSight sol 346 (2019-11-16) low -101.5ºC (-150.8ºF) high -23.5ºC (-10.3ºF)winds from the SSE at 4.8 m/s (10.8 mph) gusting to 20.0 m/s (44.7 mph)pressure at 6.80
{'Detail': 'InSight sol 346 (2019-11-16) low -101.5ºC (-150.8ºF) high -23.5ºC (-10.3ºF)winds from the SSE at 4.8 m/s (10.8 mph) gusting to 20.0 m/s (44.7 mph)pressure at 6.80', 'Detail URL': 'https://twitter.com/i/web/status/1196244436252078080', 'Detail Description': 'Mars Weather Tweet'}


In [31]:
def scrape_facts(site):
    print("Mars Facts")
    print("____________")
    facts_soup = get_page(site['URL'])
    table = facts_soup.find("table",class_="tablepress tablepress-id-p-mars")
    table_body = table.tbody
    columns = table_body.find_all("td")
    descriptions = []
    values = []
    col_num = 0
    for column in columns:
        column_text = column.text
        if (col_num % 2) == 0:
            column_text = column_text[0:(len(column_text)-1)]
            descriptions.append(column_text)
        else:
            values.append(column_text)
        col_num += 1
    facts = pd.DataFrame()
    facts["Description"] = descriptions
    facts["Values"] = values
    facts = facts.set_index("Description")
    print(facts)
    facts_table = facts.to_dict()
    article = {}
    article["Detail"] = facts_table["Values"]
    article["Detail URL"] = site['URL']
    article["Detail Description"] = "Mars Facts"
    return article

# scrape_facts test
print(scrape_facts(mars_sites[3]))

Mars Facts
____________
                                             Values
Description                                        
Equatorial Diameter                        6,792 km
Polar Diameter                             6,752 km
Mass                  6.39 × 10^23 kg (0.11 Earths)
Moons                           2 (Phobos & Deimos)
Orbit Distance             227,943,824 km (1.38 AU)
Orbit Period                   687 days (1.9 years)
Surface Temperature:                   -87 to -5 °C
First Record                      2nd millennium BC
Recorded By                    Egyptian astronomers
{'Detail': {'Equatorial Diameter': '6,792 km', 'Polar Diameter': '6,752 km', 'Mass': '6.39 × 10^23 kg (0.11 Earths)', 'Moons': '2 (Phobos & Deimos)', 'Orbit Distance': '227,943,824 km (1.38 AU)', 'Orbit Period': '687 days (1.9 years)', 'Surface Temperature:': '-87 to -5 °C', 'First Record': '2nd millennium BC', 'Recorded By': 'Egyptian astronomers'}, 'Detail URL': 'https://space-facts.com/mars/', 'Det

In [14]:
def scrape_hemispheres(site):
    print("Mars Hemispheres")
    print("____________")
    hemispheres_soup = get_page(site['URL'])
    items = hemispheres_soup.find_all("div",class_="item")
    link_stem = site["Link Stem"]
    hemispheres = []
    for item in items:
        hemisphere = {}
        hemisphere["Name"] = item.img["alt"].split(" ")[0]
        hemisphere_page = f"{link_stem}{item.a['href']}"
        hemisphere["Page"] = hemisphere_page
        hemisphere_soup = get_page(hemisphere_page)
        hemisphere_image = hemisphere_soup.find("img",class_="wide-image")
        hemisphere["Image"] = f"{link_stem}{hemisphere_image['src']}"
        hemispheres.append(hemisphere)
    article = {}
    article["Detail"] = hemispheres
    article["Detail URL"] = site["URL"]
    article["Detail Description"] = "Mars Hemispheres"
    return article

# scrape_hemispheres test
print(scrape_hemispheres(mars_sites[4]))

Mars Hemispheres
____________
{'Detail': [{'Name': 'Cerberus', 'Page': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced', 'Image': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'}, {'Name': 'Schiaparelli', 'Page': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced', 'Image': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'}, {'Name': 'Syrtis', 'Page': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced', 'Image': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'}, {'Name': 'Valles', 'Page': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced', 'Image': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}], 'Detail URL': 'https://astrogeolo

In [32]:
def get_post(site):
    post = {}
    post.update({"Site":site["Name"]})
    url = site["URL"]
    post["Site URL"] = url
    site_type = site["Type"]
    post["Site Type"] = site_type
    details = {}
    if site['Type'] == "News":
        details = scrape_news(site)
    elif site['Type'] == "Featured Image":
        details = scrape_featured_image(site)
    elif site['Type'] == "Weather":
        details = scrape_weather(site)
    elif site['Type']=="Facts":
        details = scrape_facts(site)
    else:
        details = scrape_hemispheres(site)
    print(f"{site['Name']} Details:")
    print(details)
    print()
    post["Detail"] = details["Detail"]
    post["Detail URL"] = details["Detail URL"]
    post["Detail Description"] = details["Detail Description"]
    return post
          
# test get_post
print(get_post(mars_sites[0]))

News Article
NASA Mars Explorer News
____________
2019-11-15 00:00:00
https://mars.nasa.gov/news/8551/mars-scientists-investigate-ancient-life-in-australia/
Mars Scientists Investigate Ancient Life in Australia
Teams with NASA's Mars 2020 and ESA's ExoMars practiced hunting for fossilized microbial life in the Australian Outback in preparation for their Red Planet missions.
____________
NASA Mars Explorer News Details:
{'Detail': 'Mars Scientists Investigate Ancient Life in Australia', 'Detail URL': 'https://mars.nasa.gov/news/8551/mars-scientists-investigate-ancient-life-in-australia/', 'Detail Description': "Teams with NASA's Mars 2020 and ESA's ExoMars practiced hunting for fossilized microbial life in the Australian Outback in preparation for their Red Planet missions."}

{'Site': 'NASA Mars Explorer News', 'Site URL': 'https://mars.nasa.gov/news', 'Site Type': 'News', 'Detail': 'Mars Scientists Investigate Ancient Life in Australia', 'Detail URL': 'https://mars.nasa.gov/news/8551/

In [33]:
def scrape_sites():
    sites = []
    for site in mars_sites:
        post = get_post(site)
        sites.append(post)
    return sites

# test scrape_sites
print(scrape_sites())

News Article
NASA Mars Explorer News
____________
2019-11-15 00:00:00
https://mars.nasa.gov/news/8551/mars-scientists-investigate-ancient-life-in-australia/
Mars Scientists Investigate Ancient Life in Australia
Teams with NASA's Mars 2020 and ESA's ExoMars practiced hunting for fossilized microbial life in the Australian Outback in preparation for their Red Planet missions.
____________
NASA Mars Explorer News Details:
{'Detail': 'Mars Scientists Investigate Ancient Life in Australia', 'Detail URL': 'https://mars.nasa.gov/news/8551/mars-scientists-investigate-ancient-life-in-australia/', 'Detail Description': "Teams with NASA's Mars 2020 and ESA's ExoMars practiced hunting for fossilized microbial life in the Australian Outback in preparation for their Red Planet missions."}

Featured Image
____________
Cassiopeia A: Death Becomes Her
/spaceimages/images/wallpaper/PIA03519-1920x1200.jpg
JPL Mars Images Details:
{'Detail': 'Cassiopeia A: Death Becomes Her', 'Detail URL': 'https://www.jp

In [34]:
# Insert Mars site into MongoDB
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.mars_db
db.mars.drop()
posts = scrape_sites()
x = db.mars.insert_many(posts)
print(x.inserted_ids)

News Article
NASA Mars Explorer News
____________
____________
NASA Mars Explorer News Details:
{'Detail': '', 'Detail URL': '', 'Detail Description': ''}

Featured Image
____________
Cassiopeia A: Death Becomes Her
/spaceimages/images/wallpaper/PIA03519-1920x1200.jpg
JPL Mars Images Details:
{'Detail': 'Cassiopeia A: Death Becomes Her', 'Detail URL': 'https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA03519-1920x1200.jpg', 'Detail Description': 'Jet Propulsion Laboratory Featured Image'}

Mars Weather
____________
https://twitter.com/i/web/status/1196244436252078080
InSight sol 346 (2019-11-16) low -101.5ºC (-150.8ºF) high -23.5ºC (-10.3ºF)winds from the SSE at 4.8 m/s (10.8 mph) gusting to 20.0 m/s (44.7 mph)pressure at 6.80
Mars Weather Details:
{'Detail': 'InSight sol 346 (2019-11-16) low -101.5ºC (-150.8ºF) high -23.5ºC (-10.3ºF)winds from the SSE at 4.8 m/s (10.8 mph) gusting to 20.0 m/s (44.7 mph)pressure at 6.80', 'Detail URL': 'https://twitter.com/i/web/status/1196244436

In [35]:
# Verify insertion
items = db.mars.find()
for item in items:
    print(item['Site'])

NASA Mars Explorer News
JPL Mars Images
Mars Weather
Mars Facts
Mars Hemispheres
