In [2]:
# Dependencies
from bs4 import BeautifulSoup
from splinter import Browser
import requests
import pymongo
import time
import pandas as pd

In [3]:
# open browser window using splinter 
def init_browser():
    executable_path = {"executable_path": r"C:/chromedriver.exe"}
    return Browser("chrome", **executable_path, headless=False)
browser = init_browser()

# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [109]:
# Define database and collection
db = client.mission_to_mars_db
collection = db.items
db.items.delete_many({})

<pymongo.results.DeleteResult at 0x24438474438>

# Scraping

## NASA Mars News
Note: the content (list of news) is parsed using Splinter as using just page source with BeautifulSoup returns narrower and not the latest results

In [73]:
# open page with splinter
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
browser.visit(url)
# wait for page content to be generated
time.sleep(5)



In [75]:
# get into b/s, get list of elements to extract the data from
soup = BeautifulSoup(browser.html, 'lxml')
result=soup.find('div', class_='list_text')
result

<div class="list_text"><div class="list_date">May 23, 2018</div><div class="content_title"><a href="/news/8342/insight-steers-toward-mars/" target="_self">InSight Steers Toward Mars</a></div><div class="article_teaser_body">The spacecraft has completed its first trajectory correction maneuver.</div></div>

In [79]:
#Extract title and teaser into variables
news_title=result.find('div', class_='content_title',recursive=True).text
news_title=news_title.replace('\n', ' ').replace('\r', '').strip()
news_p=result.find('div', class_='article_teaser_body',recursive=True).text
news_p=news_p.replace('\n', ' ').replace('\r', '').strip()
print(news_title)
print(news_p)

InSight Steers Toward Mars
The spacecraft has completed its first trajectory correction maneuver.


## JPL Mars Space Images - Featured Image

In [8]:
# open page with splinter
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=mars'
browser.visit(url)
# click the button opening the featured image
browser.click_link_by_id('full_image')
# wait for page content to be generated
time.sleep(5)
# click the button opening details page for the featured image
browser.click_link_by_partial_text('more info')

In [9]:
soup = BeautifulSoup(browser.html, 'lxml')
results=soup.find_all('div', class_='download_tiff')
results

[<div class="download_tiff">
 <p>Full-Res TIFF: <a href="//photojournal.jpl.nasa.gov/tiff/PIA16227.tif">PIA16227.tif</a></p>
 </div>, <div class="download_tiff">
 <p>Full-Res JPG: <a href="//photojournal.jpl.nasa.gov/jpeg/PIA16227.jpg">PIA16227.jpg</a></p>
 </div>]

In [11]:
for result in results:
    # Error handling
    try:
        # Identify and return URL of jpeg image
      if ('jpg' in result.find('a').text):
        featured_image_url= 'https:'+result.find('a')['href']    
    except Exception as e:
      print(e)
    
featured_image_url

'https://photojournal.jpl.nasa.gov/jpeg/PIA16227.jpg'

## Mars Weather

In [13]:
url='https://twitter.com/marswxreport?lang=en'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
# get into b/s, get list of elements to extract the data from
mars_weather=soup.find('p', class_='TweetTextSize').text
mars_weather

'Sol 2058 (May 21, 2018), Sunny, high 4C/39F, low -71C/-95F, pressure at 7.43 hPa, daylight 05:20-17:20'

## Mars Facts

In [40]:
url = 'https://space-facts.com/mars/s'
tables = pd.read_html(url)
tables[0]

Unnamed: 0,0,1
0,Equatorial Diameter:,"120,536 km"
1,Polar Diameter:,"108,728 km"
2,Mass:,5.68 × 10^26 kg (95 Earths)
3,Moons:,"62 (Titan, Enceladus, Iapetus & Rhea)"
4,Rings:,30+ (7 Groups)
5,Orbit Distance:,"1,426,666,422 km (9.54 AU)"
6,Orbit Period:,"10,756 days (29.5 years)"
7,Effective Temperature:,-178 °C
8,First Record:,8th century BC
9,Recorded By:,Assyrians


In [41]:
df = tables[0]
df.columns = ['Parameter', 'Value']
df.set_index('Parameter',inplace=True)
df


Unnamed: 0_level_0,Value
Parameter,Unnamed: 1_level_1
Equatorial Diameter:,"120,536 km"
Polar Diameter:,"108,728 km"
Mass:,5.68 × 10^26 kg (95 Earths)
Moons:,"62 (Titan, Enceladus, Iapetus & Rhea)"
Rings:,30+ (7 Groups)
Orbit Distance:,"1,426,666,422 km (9.54 AU)"
Orbit Period:,"10,756 days (29.5 years)"
Effective Temperature:,-178 °C
First Record:,8th century BC
Recorded By:,Assyrians


In [80]:
mars_facts = df.to_html().replace('\n', '')
mars_facts

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Value</th>    </tr>    <tr>      <th>Parameter</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>120,536 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>108,728 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>5.68 × 10^26 kg (95 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>62 (Titan, Enceladus, Iapetus &amp; Rhea)</td>    </tr>    <tr>      <th>Rings:</th>      <td>30+ (7 Groups)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>1,426,666,422 km (9.54 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>10,756 days (29.5 years)</td>    </tr>    <tr>      <th>Effective Temperature:</th>      <td>-178 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>8th\xa0century BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Assyrians</td>    </tr>  </tbody></table>

## Mars Hemisperes

In [55]:
# open page with splinter
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
host= 'https://astrogeology.usgs.gov'
browser.visit(url)
time.sleep(5)

In [56]:
#extract links to Mars Hemispehere images
soup = BeautifulSoup(browser.html, 'lxml')
results=soup.find_all('a', class_='product-item',recursive=True)
results

[<a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/dfaf3849e74bf973b59eb50dab52b583_cerberus_enhanced.tif_thumb.png"/></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/7677c0a006b83871b5a2f66985ab5857_schiaparelli_enhanced.tif_thumb.png"/></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><h3>Schiaparelli Hemisphere Enhanced</h3></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/syrtis_major_enhanced"><img alt="Syrtis Major Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/aae41197e40d6d4f3ea557f8cfe51d15_syrtis_major_enhanced.tif_thumb.png"/></a>,

In [71]:
mars_hemispheres=[]
thumb_urls=[]
titles=[]
img_page_urls=[]

from math import trunc
# locate and extract hemisphere's names and a links to pages with full-size images
for i in range(trunc(len(results)/2)):
    # Error handling
    try:
        thumb_url=host + results[2*i].img['src']
        title = results[2*i+1].text         
        img_page_url= host + results[2*i+1]['href']
        # Run only if all is available
        if (title and img_page_url and thumb_url):
            thumb_urls.append(thumb_url)
            titles.append(title)
            img_page_urls.append(img_page_url)
    except Exception as e:
        print(e)

print(thumb_urls)
print(titles)
print(img_page_urls)

['https://astrogeology.usgs.gov/cache/images/dfaf3849e74bf973b59eb50dab52b583_cerberus_enhanced.tif_thumb.png', 'https://astrogeology.usgs.gov/cache/images/7677c0a006b83871b5a2f66985ab5857_schiaparelli_enhanced.tif_thumb.png', 'https://astrogeology.usgs.gov/cache/images/aae41197e40d6d4f3ea557f8cfe51d15_syrtis_major_enhanced.tif_thumb.png', 'https://astrogeology.usgs.gov/cache/images/04085d99ec3713883a9a57f42be9c725_valles_marineris_enhanced.tif_thumb.png']
['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced', 'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced']
['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']


In [92]:
# Extract full res image URL from each hemisphere's page
img_urls=[]
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

for img_page_url in img_page_urls:
    browser.visit(img_page_url)
    soup = BeautifulSoup(browser.html, 'lxml')
    result=soup.find('a', text='Original', recursive=True)
    img_urls.append(result['href'])
    browser.visit(url)

In [103]:
# create a dictionary with Mars hemispheres data via a dataframe 
mars_hemispheres_imgs={}
mars_hemispheres_imgs=pd.DataFrame.from_items([('title', titles), ('img_url',img_urls)]).to_dict('records')
mars_hemispheres_imgs

[{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [104]:
# create a dictionary with all the data collected
mars_record= {
        'news_title': news_title,
        'news_p': news_p,
        'featured_image_url': featured_image_url,
        'mars_weather': mars_weather,
        'mars_facts': mars_facts,
        'hemisphere_image_urls': mars_hemispheres_imgs
    }
mars_record

{'featured_image_url': 'https://photojournal.jpl.nasa.gov/jpeg/PIA16227.jpg',
 'hemisphere_image_urls': [{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif',
   'title': 'Cerberus Hemisphere Enhanced'},
  {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif',
   'title': 'Schiaparelli Hemisphere Enhanced'},
  {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif',
   'title': 'Syrtis Major Hemisphere Enhanced'},
  {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif',
   'title': 'Valles Marineris Hemisphere Enhanced'}],
 'mars_facts': '<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Value</th>    </tr>    <tr>      <th>Parameter</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>120,536 km</td> 

In [110]:
# Insert data into Mongo DB
db.items.insert_one(mars_record)

<pymongo.results.InsertOneResult at 0x244384745e8>