In [1]:
# import dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pymongo

## Step 1 Scraping

In [2]:
# initializing dictionary that will hold all info so I can test it here 
# before it goes into py file
full_dict = {}

### NASA Mars News
* Scrape the [Mars News Site](https://redplanetscience.com/) and collect the latest News Title and Paragraph Text. 
* Assign the text to variables that you can reference later.

In [3]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [/Users/juliebaker/.wdm/drivers/chromedriver/mac64/92.0.4515.107/chromedriver] found in cache


In [4]:
# direct splinter to website
mars_news = 'https://redplanetscience.com/'
browser.visit(mars_news)

In [5]:
# Create a Beautiful Soup object
soup = bs(browser.html, 'html.parser')
type(soup)

bs4.BeautifulSoup

In [6]:
title = soup.find('div', class_="content_title")
article_title = title.get_text()
print(article_title)

All About the Laser (and Microphone) Atop Mars 2020, NASA's Next Rover


In [7]:
teaser = soup.find('div', class_="article_teaser_body")
article_teaser = teaser.get_text()
print(article_teaser)

SuperCam is a rock-vaporizing instrument that will help scientists hunt for Mars fossils.


In [8]:
# add items to dictionary
full_dict.update({'article_title': article_title, 'article_teaser': article_teaser})
full_dict

{'article_title': "All About the Laser (and Microphone) Atop Mars 2020, NASA's Next Rover",
 'article_teaser': 'SuperCam is a rock-vaporizing instrument that will help scientists hunt for Mars fossils.'}

In [9]:
browser.quit()

### JPL Mars Space Images - Featured Image

* Visit the url for the Featured Space Image site [here](https://spaceimages-mars.com).

* Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called `featured_image_url`.

* Make sure to find the image url to the full size `.jpg` image.

* Make sure to save a complete url string for this image.

In [10]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [/Users/juliebaker/.wdm/drivers/chromedriver/mac64/92.0.4515.107/chromedriver] found in cache


In [11]:
# direct splinter to website
image_url = 'https://spaceimages-mars.com/'
browser.visit(image_url)

In [12]:
# pull info 
html = browser.html
soup_2 = bs(html, 'html.parser')

In [13]:
# look at html to determine where image is
print(soup_2.prettify)

<bound method Tag.prettify of <html class=""><head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet"/>
<!-- <link rel="stylesheet" type="text/css" href="css/font.css"> -->
<link href="css/app.css" rel="stylesheet" type="text/css"/>
<link href="https://stackpath.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
<title>Space Image</title>
<style type="text/css">.fancybox-margin{margin-right:15px;}</style></head>
<body>
<div class="header">
<nav class="navbar navbar-expand-lg">
<a class="navbar-brand" href="#"><img id="logo" src="image/nasa.png"/><span class="logo">Jet Propulsion Laboratory</span>
<span class="logo1">California Institute of Technology</span></a>
<button aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation" class="navbar-toggler" data-target="#navbarNav" data-

In [14]:
# why is this coming back with 1? when I click on the site myself it says 2 as does the homework itself
pic_results = soup_2.find('img', class_="headerimage fade-in").get('src')
print(pic_results)

image/featured/mars1.jpg


In [15]:
feature_name = soup_2.find('h1', class_='media_feature_title').text
print(feature_name)

Dusty Space Cloud


In [16]:
feature_image_url = image_url + pic_results
print(feature_image_url)

https://spaceimages-mars.com/image/featured/mars1.jpg


In [17]:
full_dict.update({'feature_name': feature_name, 'feature_image_url': feature_image_url})
full_dict

{'article_title': "All About the Laser (and Microphone) Atop Mars 2020, NASA's Next Rover",
 'article_teaser': 'SuperCam is a rock-vaporizing instrument that will help scientists hunt for Mars fossils.',
 'feature_name': 'Dusty Space Cloud',
 'feature_image_url': 'https://spaceimages-mars.com/image/featured/mars1.jpg'}

In [18]:
browser.quit()

### Mars Facts

* Visit the Mars Facts webpage [here](https://galaxyfacts-mars.com) and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.

* Use Pandas to convert the data to a HTML table string.

In [19]:
# set url code to a variable
table_url = "https://galaxyfacts-mars.com/"

In [20]:
# pandas extracts the tables
tables = pd.read_html(table_url)
tables

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [21]:
# choose only the 2nd table because we don't need the comparisons to earth
just_mars_df = tables[1]
just_mars_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 ( Phobos & Deimos )
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [22]:
# convert to html
mars_html_table = just_mars_df.to_html()
mars_html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 ( Phobos &amp; Deimos )</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n   

In [23]:
mars_table_strip = mars_html_table.replace('\n', '')
mars_table_strip

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>0</th>      <th>1</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <th>1</th>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2 ( Phobos &amp; Deimos )</td>    </tr>    <tr>      <th>4</th>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>5</th>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>6</th>      <td>Surface Temperature:</td>      <td>-87 to -5 °C</td>    </tr>    <tr>      <th>7</th>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>8</th>      <td>Recorded By:</td>      <td>Egyptian astronomers</td>    

In [24]:
# add to dictionary
full_dict.update({'table_url': table_url, 'mars_html_table': mars_table_strip})
full_dict

{'article_title': "All About the Laser (and Microphone) Atop Mars 2020, NASA's Next Rover",
 'article_teaser': 'SuperCam is a rock-vaporizing instrument that will help scientists hunt for Mars fossils.',
 'feature_name': 'Dusty Space Cloud',
 'feature_image_url': 'https://spaceimages-mars.com/image/featured/mars1.jpg',
 'table_url': 'https://galaxyfacts-mars.com/',
 'mars_html_table': '<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>0</th>      <th>1</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <th>1</th>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2 ( Phobos &amp; Deimos )</td>    </tr>    <tr>      <th>4</th>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.38 AU)

### Mars Hemispheres

* Visit the astrogeology site [here](https://marshemispheres.com/) to obtain high resolution images for each of Mar's hemispheres.

* You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.

* Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys `img_url` and `title`.

* Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.


In [37]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [/Users/juliebaker/.wdm/drivers/chromedriver/mac64/92.0.4515.107/chromedriver] found in cache


In [38]:
# assign url variables for main pages
cerb_url = 'https://marshemispheres.com/cerberus.html'
schia_url = 'https://marshemispheres.com/schiaparelli.html'
syrt_url = 'https://marshemispheres.com/syrtis.html'
valles_url = 'https://marshemispheres.com/valles.html'
generic_url = 'https://marshemispheres.com/'

In [39]:
# url list to iterate through the loop
hemi_urls = [cerb_url, schia_url, syrt_url, valles_url]
hemi_urls

['https://marshemispheres.com/cerberus.html',
 'https://marshemispheres.com/schiaparelli.html',
 'https://marshemispheres.com/syrtis.html',
 'https://marshemispheres.com/valles.html']

In [40]:
# initialize lists
hemi_titles = []
hemi_pics = []

In [41]:
# establish list which will hold hemisphere dictionaries
mars_hemis_dict = []

In [43]:
# loop
for url in hemi_urls:
    # direct splinter to website
    this_url = url
    browser.visit(this_url)

    #create a beautiful soup object
    this_soup = bs (browser.html, 'html.parser')

    # pull info for lists
    this_title = this_soup.find('h2', class_="title").get_text()
    hemi_titles.append('image_title: ' + this_title)

    # stripping end of url
    this_img = this_soup.find_all('a', href=True)
    this_pic = (this_img[3]['href'])
    this_pic_url = generic_url + this_pic
    hemi_pics.append('hemi_img_url: ' + (generic_url + this_pic))
    
    # putting into sub-dictionary
    this_dict = {'title': this_title, 'img_url': this_pic_url}
    mars_hemis_dict.append(this_dict)
    

browser.quit()    

In [45]:
mars_hemis_dict

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg'}]

In [46]:
hemi_pics

['hemi_img_url: https://marshemispheres.com/images/full.jpg',
 'hemi_img_url: https://marshemispheres.com/images/full.jpg',
 'hemi_img_url: https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg',
 'hemi_img_url: https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg',
 'hemi_img_url: https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg']

In [47]:
hemi_titles

['image_title: Cerberus Hemisphere Enhanced',
 'image_title: Cerberus Hemisphere Enhanced',
 'image_title: Schiaparelli Hemisphere Enhanced',
 'image_title: Syrtis Major Hemisphere Enhanced',
 'image_title: Valles Marineris Hemisphere Enhanced']

In [48]:
mars_hemis_dict

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg'}]

In [49]:
full_dict.update({'mars_hemis': mars_hemis_dict})
full_dict

{'article_title': "All About the Laser (and Microphone) Atop Mars 2020, NASA's Next Rover",
 'article_teaser': 'SuperCam is a rock-vaporizing instrument that will help scientists hunt for Mars fossils.',
 'feature_name': 'Dusty Space Cloud',
 'feature_image_url': 'https://spaceimages-mars.com/image/featured/mars1.jpg',
 'table_url': 'https://galaxyfacts-mars.com/',
 'mars_html_table': '<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>0</th>      <th>1</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <th>1</th>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2 ( Phobos &amp; Deimos )</td>    </tr>    <tr>      <th>4</th>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.38 AU)

In [50]:
print(hemi_pics)

['https://marshemispheres.com/images/full.jpg', 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg', 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg', 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg']


In [32]:
print(hemi_titles)

['image_title:Cerberus Hemisphere Enhanced', 'image_title:Schiaparelli Hemisphere Enhanced', 'image_title:Syrtis Major Hemisphere Enhanced', 'image_title:Valles Marineris Hemisphere Enhanced']


In [50]:
# start connection to mongo db
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# definte the 'mars_db' database in Mongo
db = client.mars_db

# declare the collection
mars_collection = db.mars 

# insert the dictionary
mars_collection.insert_one(full_dict)

<pymongo.results.InsertOneResult at 0x11dec3180>

In [54]:
# start connection to mongo db FOR 2nd OPTION
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# definte the 'mars_db' database in Mongo
db = client.mars_db2

# declare the collection
mars_collection2 = db.mars2 

# insert the dictionary
mars_collection2.insert_one(dict_2)

<pymongo.results.InsertOneResult at 0x1194a3d40>