In [137]:
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd
from datetime import datetime
import config as cfg
import tweepy as tw
import json

In [44]:
# Constants
chrome_path = "C:/Users/janin/OneDrive/Documents/GitHub/chromedriver.exe"
mars_sites = [{"Name":"NASA Mars Explorer News",
               "URL":"https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest",
               "Type":"News",
               "Link Stem":"https://mars.nasa.gov"
              },
              {"Name":"JPL Mars Images",
               "URL":"https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars",
               "Type":"Featured Image",
               "Link Stem":"https://www.jpl.nasa.gov"
              },
              {"Name":"Mars Weather",
               "URL":"https://twitter.com/marswxreport?lang=en",
               "Type":"Weather"
              },
              {"Name":"Mars Facts",
               "URL":"https://space-facts.com/mars/",
               "Type":"Facts"
              },
              {"Name":"Mars Hemispheres",
               "URL":"https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars",
               "Type":"Hemispheres"
              }]

In [45]:
def init_browser():
    executable_path = {"executable_path": chrome_path}
    return Browser("chrome", **executable_path, headless=False)

In [46]:
def get_page(url):
    browser = init_browser()
    browser.visit(url)
    html = browser.html
    return BeautifulSoup(html, "html.parser")

In [47]:
def scrape_news(site):
    print("News Article")
    print("____________")
    soup = get_page(site['URL'])
    article = {}
    articles = soup.find_all("div",class_ = "list_text")
    latest_date = datetime(2000, 1, 1, 0, 0)
    article_url = ""
    article_title = ""
    description = ""
    for article in articles:
        date_text = article.find("div",class_="list_date").text
        article_date = datetime.strptime(date_text,'%B %d, %Y')
        if article_date > latest_date:
            latest_date = article_date
            print(article_date)
            article_link = article.find("div",class_="content_title")
            article_url = f"{site['Link Stem']}{article_link.a['href']}"
            print(article_url)
            article_title = article_link.a.text.replace('\n','').strip()
            print(article_title)
            description = article.find("div", class_ = "article_teaser_body")
            article_description = description.text.strip()
            print(article_description)
    print("____________")
    article["Detail"] = article_title
    article["Detail URL"] = article_url
    article["Detail Description"] = article_description
    return article
            

In [48]:
def scrape_featured_image(site):
    print("Featured Image")
    print("____________")
    soup = get_page(site['URL'])
    article = {}
    image = soup.find("article",class_="carousel_item")
    print(image["alt"])
    image_style = image["style"]
    image_link = image_style[image_style.find("'")+1:]
    image_link = image_link[:image_link.find("'")]
    print(image_link)
    article["Detail"] = image["alt"]
    article["Detail URL"] = f"{site['Link Stem']}{image_link}"
    article["Detail Description"] = "Jet Propulsion Laboratory Featured Image"
    return article

In [247]:
def get_Mars_weather():
    print("Mars Weather")
    print("____________")
    # Use API to get latest Mars Weather tweet
    consumer_key = cfg.Twitter_Consumer_API_Key
    consumer_secret = cfg.Twitter_Consumer_Secret_API_Key
    access_token = cfg.Twitter_Access_Token
    access_token_secret = cfg.Twitter_Access_Token_Secret
    auth = tw.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret) 
    api = tw.API(auth)
    status = api.user_timeline("MarsWxReport",count=1,page=1)
    #json_str = json.dumps(status[0]._json)
    #parsed = json.loads(json_str)
    #print(json.dumps(parsed, indent=4, sort_keys=True))
    entities = status[0].entities
    urls = dict(entities["urls"][0])
    last_tweet_url = ""
    for key in urls.keys():
        if key == "expanded_url":
            last_tweet_url = urls[key]
    print(last_tweet_url)
    # Scrape text of latest tweet
    soup = get_page(last_tweet_url)
    tweet = soup.find("div",class_="js-tweet-text-container")
    tweet_text = tweet.p.text.replace('\n','').strip()
    tweet_text = tweet_text[:(tweet_text.find("hPapic.twitter.com")-1)]
    print(tweet_text)
    article = {}
    article["Detail"] = tweet_text
    article["Detail URL"] = last_tweet_url
    article["Detail Description"] = "Mars Weather Tweet"
    return article

In [248]:
def get_post(site):
    post = {}
    post.update({"Site":site["Name"]})
    url = site["URL"]
    post["Site URL"] = url
    site_type = site["Type"]
    post["Site Type"] = site_type
    details = {}
    if site['Type'] == "News":
        details = scrape_news(site)
    elif site['Type'] == "Featured Image":
        details = scrape_featured_image(site)
    elif site['Type'] == "Weather":
        details = get_Mars_weather()
    post["Detail"] = details["Detail"]
    post["Detail URL"] = details["Detail URL"]
    post["Detail Description"] = details["Detail Description"]
    return post

In [249]:
site = mars_sites[2]
post = get_post(site)
print(post)
print()

Mars Weather
____________
https://twitter.com/i/web/status/1196063234924699649
InSight sol 345 (2019-11-15) low -100.4ºC (-148.6ºF) high -23.9ºC (-11.1ºF)winds from the SSE at 5.4 m/s (12.0 mph) gusting to 20.2 m/s (45.3 mph)pressure at 6.80
{'Site': 'Mars Weather', 'Site URL': 'https://twitter.com/marswxreport?lang=en', 'Site Type': 'Weather', 'Detail': 'InSight sol 345 (2019-11-15) low -100.4ºC (-148.6ºF) high -23.9ºC (-11.1ºF)winds from the SSE at 5.4 m/s (12.0 mph) gusting to 20.2 m/s (45.3 mph)pressure at 6.80', 'Detail URL': 'https://twitter.com/i/web/status/1196063234924699649', 'Detail Description': 'Mars Weather Tweet'}



In [None]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.mars_db
collection = db.items
for site in mars_sites:
    response = requests.get(site["URL"])
    soup = BeautifulSoup(response.text, 'lxml')
    results = soup.find_all('li', class_=site["Class"])
    for result in results:
        