In [None]:
# Import packages.
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import json

mongo_dict = {}

## Get NASA Mars News.

In [None]:
# Scrape data from nasa.gov API.
news_url = "https://mars.nasa.gov/api/v1/news_items/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
resp = requests.get(news_url, headers=header)
resp = resp.json()


In [None]:
# Check out the response.
print(resp.keys())
print(resp)

In [None]:
# Extract relevant data.
articles = resp.get('items')
articles

In [None]:
article_df = pd.DataFrame.from_records(articles)
article_df.head()

In [None]:
# Store title and content for later.
titles = article_df['title']
content = article_df['body']
titles_content = article_df[['title','body']]

## Get JPL Mars Space Images - Featured Image.

In [None]:
# Image url.
image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    
# Use splinter to navigate the site and find the image url for the current Featured 
# Mars Image and assign the url string to a variable called featured_image_url.
    
executable_path = {'executable_path': '/Users/jenniferwilson/Desktop/Repositories/UDEN201811DATA3-Homework/chromedriver'}
browser = Browser('chrome', **executable_path)

browser.visit(image_url)
html = browser.html
soup = bs(html, 'html.parser')

# How are these different
#soup.find_all("article",{"class":"carousel_item"})[0]
#soup.find_all("article",{"class":"carousel_item"})[0].contents

In [None]:
# From soup object above, get featured image URL.
featured_image_url = "https://www.jpl.nasa.gov" + soup.article["style"].split("'")[1]
featured_image_url

## Get Mars Weather.

In [None]:
twitter_url = "https://twitter.com/marswxreport?lang=en"
header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
resp = requests.get(twitter_url, headers=header)
soup = bs(resp.text,'html.parser')

In [None]:
# Extract first tweet.
tweet_text = soup.find("p",{"class":"TweetTextSize"}).get_text()

# Remove ASCII characters.
tweet_text = tweet_text.encode("ascii","ignore")

# Split based on line breaks.
mars_weather = tweet_text.splitlines()

mars_weather

# Output should look like this:
#InSight sol 88 (2019-02-25) low -95.2ºC (-139.4ºF) high -17.8ºC (0.0ºF)
#winds from the SW at 5.2 m/s (11.6 mph) gusting to 19.8 m/s (44.3 mph)
#pressure at 7.20 hPa


## Mars Facts

In [None]:
# use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
def get_facts():
    facts_url = "http://space-facts.com/mars/"
    facts = pd.read_html(facts_url)
    facts_df = facts[0]
    facts_df.rename(columns={0: "Stat", 1:"Values"}, inplace = True) 
    facts_df['Values'] = map(lambda x: x.encode('ascii', 'ignore').decode('ascii'), facts_df["Values"])
    facts_html = facts_df.to_html()
    

In [None]:
# Use Pandas to convert the data to a HTML table string.
# https://stackoverflow.com/questions/3206344/passing-html-to-template-using-flask-jinja2
facts_html = facts_df.to_html()
print(facts_html)

## Mars Hemispheres

In [None]:
# Scrape starting page for the URLs to each hemisphere page.
image_page_urls = []

def get_hemisphere_urls():
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    resp = requests.get(url, headers=header)
    soup = bs(resp.text,'html.parser')
    hrefs = soup.find_all("a", {"class": "itemLink product-item"})
    
    for i in range(len(hrefs)):
        href_trimmed = hrefs[i]['href'].encode('utf-8')
        image_page_urls.append("https://astrogeology.usgs.gov/" + href_trimmed)

get_hemisphere_urls()
print(image_page_urls)


In [None]:
url = "https://astrogeology.usgs.gov//search/map/Mars/Viking/cerberus_enhanced"
header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
resp = requests.get(url, headers=header)
soup = bs(resp.text,'html.parser')

image_url = soup.find_all("a", attrs={"target": "_blank"})
image_url = image_url[1]["href"]

title = soup.find('h2', {"class":"title"}).get_text().encode('utf-8')

        

In [None]:
hemisphere_image_urls = []

def get_hemisphere_image():
    for i in range(len(image_page_urls)):
        url = image_page_urls[i]        
        header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        resp = requests.get(url, headers=header)
        soup = bs(resp.text,'html.parser')
        
        image_url = soup.find_all("a", attrs={"target": "_blank"})
        image_url = image_url[1]["href"].encode('utf-8')
        title = soup.find('h2', {"class":"title"}).get_text().encode('utf-8')
        
        hemisphere_image_urls.append({
            "img_url": image_url,
            "title": title,
        })
        
get_hemisphere_image()



In [None]:
print(hemisphere_image_urls)

## Putting it together into one dictionary.

In [4]:
# Import packages.
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import json
import re

mongo_dict = {
    "news":"",
    "featured_image":"",
    "weather":"",
    "facts":"",
    "images":""
}

In [5]:
# Get NASA Mars News and store results as a list of dictionaries.

def get_articles():
    news = []
    
    news_url = "https://mars.nasa.gov/api/v1/news_items/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    resp = requests.get(news_url, headers=header)
    resp = resp.json()

    # Get articls
    articles = resp.get('items')
    
    # Iterate over all elements in articles to extract title and body.
    for i in range(len(articles)):
        # Store title.
        title = articles[i]['title'].encode('utf-8')

        # Store body.
        body = articles[i]['body'].encode('utf-8')
        body = re.sub(r'<.*?>', '', body)
        
        
        news.append({
            "title": title,
            "body": body,
        })
        
        print(news)
        #mongo_dict.update({"news":UPDATE})

get_articles()
        

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Get JPL Mars Space Images - Featured Image.
featured_image = []

def get_featured_image():
    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"    
    executable_path = {'executable_path': '/Users/jenniferwilson/Desktop/Repositories/UDEN201811DATA3-Homework/chromedriver'}
    browser = Browser('chrome', **executable_path)
    browser.visit(image_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    featured_image_url = "https://www.jpl.nasa.gov" + soup.article["style"].split("'")[1].encode('utf-8')
    featured_image.append({"featured_image_url": featured_image_url})
    
get_featured_image()


In [None]:
# Get Mars weather.
weather = []

def get_mars_weather():
    twitter_url = "https://twitter.com/marswxreport?lang=en"
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    resp = requests.get(twitter_url, headers=header)
    soup = bs(resp.text,'html.parser')

    # Extract first tweet.
    tweet_text = soup.find("p",{"class":"TweetTextSize"}).get_text()

    # Remove ASCII characters.
    tweet_text = str(tweet_text.encode("ascii","ignore"))
    
    weather.append({"weather": tweet_text})
    
get_mars_weather()


In [None]:
# Get Mars facts.
facts = []

def get_facts():
    facts_url = "http://space-facts.com/mars/"
    facts = pd.read_html(facts_url)
    facts_df = facts[0]
    facts_df.rename(columns={0: "Stat", 1:"Values"}, inplace = True) 
    facts_df['Values'] = map(lambda x: x.encode('ascii', 'ignore').decode('ascii'), facts_df["Values"])
    facts_html = facts_df.to_html()
    print(facts_html)
    facts.append({"facts:":facts_html})

get_facts()
facts

# TUTOR - Doesn't work.

In [None]:
## Get hemisphere images.

image_page_urls = []

def get_hemisphere_urls():
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    resp = requests.get(url, headers=header)
    soup = bs(resp.text,'html.parser')
    hrefs = soup.find_all("a", {"class": "itemLink product-item"})
    
    for i in range(len(hrefs)):
        href_trimmed = hrefs[i]['href'].encode('utf-8')
        image_page_urls.append("https://astrogeology.usgs.gov/" + href_trimmed)

get_hemisphere_urls()

hemisphere_image_urls = []

def get_hemisphere_image():
    for i in range(len(image_page_urls)):
        url = image_page_urls[i]        
        header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        resp = requests.get(url, headers=header)
        soup = bs(resp.text,'html.parser')
        
        image_url = soup.find_all("a", attrs={"target": "_blank"})
        image_url = image_url[1]["href"].encode('utf-8')
        title = soup.find('h2', {"class":"title"}).get_text().encode('utf-8')
        
        hemisphere_image_urls.append({
            "img_url": image_url,
            "title": title,
        })
        
get_hemisphere_image()

