# Step 1: Scraping

In [1]:
# Dependencies
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import pandas as pd
from IPython.display import Image
from IPython.core.display import HTML, Image, display
from scrape_mars import scraper

In [15]:
# Create beautiful soup object from html, create a function
def scrape(url):
    """Load a page in a Chrome browser and return its rendered HTML as soup.

    Parameters
    ----------
    url : str
        Address of the page to visit.

    Returns
    -------
    bs4.BeautifulSoup
        The page's HTML (as reported by the browser after the visit),
        parsed with the built-in 'html.parser'.

    Notes
    -----
    A new browser is started on every call using the 'chromedriver.exe'
    driver, and it is always shut down before returning so repeated calls
    do not leave browser/driver processes running.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        browser.visit(url)
        # Grab the HTML while the browser is still alive
        html = browser.html
    finally:
        # Release the browser even if the visit fails
        browser.quit()
    return BeautifulSoup(html, 'html.parser')

### NASA Mars News

* Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/) and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.

In [3]:
# NASA Mars news listing, ordered by publish date descending
url_news = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
soup = scrape(url_news)

# Locate the article list once; find() returns the first match, i.e. the top entry
news_list = soup.find("ul", class_="item_list")
title_div = news_list.find("div", class_="content_title")
teaser_div = news_list.find("div", class_="article_teaser_body")

# Headline text lives in the title's link; the teaser is plain text
latest_news = title_div.a.text
latest_paragraph = teaser_div.text

print(f'''
The most recent article is: "{latest_news}"
The most recent paragraph is: "{latest_paragraph}"
''')


The most recent article is: "NASA Social Media and Websites Win Webby Awards "
The most recent paragraph is: "NASA's social media presence, the InSight mission social media accounts, NASA.gov and SolarSystem.NASA.gov will be honored at the 2019 Webby Awards - "the Oscars of the Internet.""



### JPL Mars Space Images - Featured Image

* Visit the url for JPL Featured Space Image [here](https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars).

* Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called `featured_image_url`.

* Make sure to find the image url to the full size `.jpg` image.

* Make sure to save a complete url string for this image.

In [4]:
# Site root and the Mars-filtered featured image page
url_base = "https://www.jpl.nasa.gov"
url_featured = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

# Scrape the page, then drill down: carousel container -> carousel item -> first link
image_soup = scrape(url_featured)
carousel = image_soup.find("div", class_="carousel_container")
carousel_item = carousel.find("article", class_="carousel_item")
image_url = carousel_item.find('a')['data-fancybox-href']

# The attribute holds a site-relative path
# (example: /spaceimages/images/mediumsize/PIA09113_ip.jpg), so prefix the site root
featured_image_url = url_base + image_url
print(featured_image_url)

# Render the image inline in the notebook
display(Image(url=featured_image_url))

https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA19092_ip.jpg


### Mars Weather

* Visit the Mars Weather twitter account [here](https://twitter.com/marswxreport?lang=en) and scrape the latest Mars weather tweet from the page. Save the tweet text for the weather report as a variable called `mars_weather`.

In [7]:
# Mars Weather twitter account
url_twitter = "https://twitter.com/marswxreport?lang=en"

# Use the scrape helper to load the page into a soup object
twitter_soup = scrape(url_twitter)

# Use soup object to locate the most recent tweet with the weather.
# find() returns the first tweet container on the page.
# NOTE(review): assumes the top tweet is a weather report - confirm.
tweet_container = twitter_soup.find("div", class_="js-tweet-text-container")
mars_weather = tweet_container.find(
    "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
).text

print(f'''
The current weather on mars is:
{mars_weather}
''')


The current weather on mars is:
InSight sol 144 (2019-04-23) low -98.7ºC (-145.7ºF) high -17.6ºC (0.4ºF)
winds from the SW at 4.2 m/s (9.5 mph) gusting to 11.1 m/s (24.8 mph)
pressure at 7.40 hPapic.twitter.com/ZbFNWx1Eq6



### Mars Facts

* Visit the Mars Facts webpage [here](http://space-facts.com/mars/) and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.

* Use Pandas to convert the data to a HTML table string.

In [16]:
# Mars facts page
fact_url = "https://space-facts.com/mars/"

# Load the page with the scrape helper
fact_soup = scrape(fact_url)

# Rows of the first table's body
table = fact_soup.find("table").find("tbody").find_all("tr")

# Print each row as "<label><value>" (first cell followed by second cell)
for row in table:
    cells = row.find_all("td")
    print(cells[0].get_text() + cells[1].get_text())

Equatorial Diameter:6,792 km

Polar Diameter:6,752 km

Mass:6.42 x 10^23 kg (10.7% Earth)
Moons:2 (Phobos & Deimos)
Orbit Distance:227,943,824 km (1.52 AU)
Orbit Period:687 days (1.9 years)

Surface Temperature: -153 to 20 °C
First Record:2nd millennium BC
Recorded By:Egyptian astronomers


In [17]:
# Alternative to the manual soup walk: pd.read_html parses the tables found at
# the URL into a list of DataFrames (fact_url comes from the previous cell)
mars_df = pd.read_html(fact_url)

# Work with the first table on the page
first_table = mars_df[0]
mars_facts_df = pd.DataFrame(first_table)

# Label the two columns, then index the frame by the fact name
mars_facts_df.columns = ['Parameter', 'Data']
mars_df_table = mars_facts_df.set_index("Parameter")
mars_df_table

Unnamed: 0_level_0,Data
Parameter,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [18]:
# Render the facts DataFrame as an HTML table string; 'marsdata' is added to
# the table's class attribute alongside pandas' default 'dataframe' class
mars_html_table = mars_df_table.to_html(classes='marsdata')

# to_html emits multi-line markup; replace every newline with a space so the
# table is a single-line string
mars_table = mars_html_table.replace('\n', ' ')

print(mars_table)

<table border="1" class="dataframe marsdata">   <thead>     <tr style="text-align: right;">       <th></th>       <th>Data</th>     </tr>     <tr>       <th>Parameter</th>       <th></th>     </tr>   </thead>   <tbody>     <tr>       <th>Equatorial Diameter:</th>       <td>6,792 km</td>     </tr>     <tr>       <th>Polar Diameter:</th>       <td>6,752 km</td>     </tr>     <tr>       <th>Mass:</th>       <td>6.42 x 10^23 kg (10.7% Earth)</td>     </tr>     <tr>       <th>Moons:</th>       <td>2 (Phobos &amp; Deimos)</td>     </tr>     <tr>       <th>Orbit Distance:</th>       <td>227,943,824 km (1.52 AU)</td>     </tr>     <tr>       <th>Orbit Period:</th>       <td>687 days (1.9 years)</td>     </tr>     <tr>       <th>Surface Temperature:</th>       <td>-153 to 20 °C</td>     </tr>     <tr>       <th>First Record:</th>       <td>2nd millennium BC</td>     </tr>     <tr>       <th>Recorded By:</th>       <td>Egyptian astronomers</td>     </tr>   </tbody> </table>


### Mars Hemispheres

* Visit the USGS Astrogeology site [here](https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) to obtain high resolution images for each of Mars's hemispheres.

* You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.

* Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys `img_url` and `title`.

* Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [19]:
# USGS Astrogeology search results for the enhanced Mars hemisphere images
hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

# Site root, used to prefix the links found on the results page
base_url = "https://astrogeology.usgs.gov"

# Load the results page with the scrape helper
hemi_soup = scrape(hemi_url)

# Collect every <div class="item"> result entry from the page
item = hemi_soup.find_all("div", class_="item")

In [20]:
# One dictionary per hemisphere: {"Title": ..., "Image Url": ...}
hemi_dicts = []

for result in item:
    # The description block carries both the detail-page link and the heading
    description = result.find(class_="description")

    # Follow the link (prefixed with the site root) to the hemisphere's own page
    detail_page = scrape(base_url + description.a["href"])

    # Take the href of the first list entry inside the downloads section
    img_url = detail_page.find("div", class_="downloads").find("li").a["href"]

    # Heading text without the " Enhanced" suffix
    title = description.h3.text.replace(" Enhanced", "")

    hemi_dicts.append({"Title": title, "Image Url": img_url})

# Display list of dictionaries
hemi_dicts

[{'Title': 'Cerberus Hemisphere',
  'Image Url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'Title': 'Schiaparelli Hemisphere',
  'Image Url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'Title': 'Syrtis Major Hemisphere',
  'Image Url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'Title': 'Valles Marineris Hemisphere',
  'Image Url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [21]:
# Display each hemisphere image inline.
# The loop variable is deliberately not called `dict`: that would rebind the
# builtin for the rest of the kernel session and break later dict(...) calls.
for hemisphere in hemi_dicts:
    # Fall back to an empty URL if an entry has no "Image Url" key
    display(Image(url=hemisphere.get("Image Url", "")))

# Step 2 - MongoDB and Flask Application

Use MongoDB with Flask templating to create a new HTML page that displays all of the information that was scraped from the URLs above.

* Start by converting your Jupyter notebook into a Python script called [scrape_mars.py](https://github.com/hollybergen/Data_Science_Projects/blob/master/Mission%20to%20Mars%20-%20Web-Scraping%20%26%20Document%20Databases/scrape_mars.py) with a function called `scraper` that will execute all of your scraping code from above and return one Python dictionary containing all of the scraped data.

See: [scrape_mars.py](https://github.com/hollybergen/Data_Science_Projects/blob/master/Mission%20to%20Mars%20-%20Web-Scraping%20%26%20Document%20Databases/scrape_mars.py)

In [4]:
# Run the scraper imported from scrape_mars and print what it returns
mars_data = scraper()
print(mars_data)

{'latest_news': 'NASA Social Media and Websites Win Webby Awards ', 'latest_paragraph': 'NASA\'s social media presence, the InSight mission social media accounts, NASA.gov and SolarSystem.NASA.gov will be honored at the 2019 Webby Awards - "the Oscars of the Internet."', 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA18614_ip.jpg', 'mars_weather': 'InSight sol 144 (2019-04-23) low -98.7ºC (-145.7ºF) high -17.6ºC (0.4ºF)\nwinds from the SW at 4.2 m/s (9.5 mph) gusting to 11.1 m/s (24.8 mph)\npressure at 7.40 hPapic.twitter.com/ZbFNWx1Eq6', 'mars_table': '<table border="1" class="dataframe marsdata">   <thead>     <tr style="text-align: right;">       <th></th>       <th>Data</th>     </tr>     <tr>       <th>Parameter</th>       <th></th>     </tr>   </thead>   <tbody>     <tr>       <th>Equatorial Diameter:</th>       <td>6,792 km</td>     </tr>     <tr>       <th>Polar Diameter:</th>       <td>6,752 km</td>     </tr>     <tr>       <th>Mass:</th>     

* Next, create a route called `/scrape` that will import your `scrape_mars.py` script and call its `scraper` function.

  * Store the return value in Mongo as a Python dictionary.

* Create a root route `/` that will query your Mongo database and pass the mars data into an HTML template to display the data.

See: [app.py](https://github.com/hollybergen/Data_Science_Projects/blob/master/Mission%20to%20Mars%20-%20Web-Scraping%20%26%20Document%20Databases/app.py)

* Create a template HTML file called [index.html](https://github.com/hollybergen/Data_Science_Projects/blob/master/Mission%20to%20Mars%20-%20Web-Scraping%20%26%20Document%20Databases/templates/index.html) that will take the mars data dictionary and display all of the data in the appropriate HTML elements. Use the following as a guide for what the final product should look like, but feel free to create your own design.

See: [index.html](https://github.com/hollybergen/Data_Science_Projects/blob/master/Mission%20to%20Mars%20-%20Web-Scraping%20%26%20Document%20Databases/templates/index.html)