In [1]:
#Import dependencies for web scraping: BeautifulSoup, Splinter, Pandas
from bs4 import BeautifulSoup
import requests
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pymongo
import pandas as pd
import time

In [3]:
#Set up splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\jrose\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache






In [4]:
#Connect to MongoDB
mongo_conn = 'mongodb://localhost:27017'
client=pymongo.MongoClient(mongo_conn)

# Nasa Mars News:

In [5]:
#Set up url path
url = 'https://mars.nasa.gov/news/'

In [6]:
#Access url path
response = requests.get(url)

In [7]:
#Create a BeautifulSoup object to parse html
soup = BeautifulSoup(response.text, 'html.parser')

In [8]:
#Use the inspect tool within the browser to find tag and class associated with news title content
news_title = soup.find('div', class_='content_title').find('a').text
print(news_title)


NASA's Perseverance Drives on Mars' Terrain for First Time



In [9]:
#Use the inspect tool within the browser to find tag and class associated with news paragraph content
news_p = soup.find('div', class_="rollover_description_inner").text
print(news_p)


The first trek of the agency’s largest, most advanced rover yet on the Red Planet marks a major milestone before science operations get under way.



In [10]:
#Generate database and collection to MongoDB
client.mars_db.mars.insert_one({'news_title': news_title,
                               'news_p':news_p})

<pymongo.results.InsertOneResult at 0x2def09ea980>

# JPL Mars Space Images - Featured Image

In [36]:
#Set up splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)







[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\jrose\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache


In [37]:
#Scrape JPL Featured Image
image_urlpath='https://www.jpl.nasa.gov/images?search=&category=Mars'
browser.visit(image_urlpath)
time.sleep(1)

In [38]:
#Use splinter and find by css method
browser.find_by_css('.BaseImage').click()

In [39]:
html=browser.html
soup = BeautifulSoup(html, 'html.parser')

In [40]:
#This works within JNB, transfers to MongoDB, displays correctly through app.py but errors for scraping
featured_img = soup.find_all('img', class_='BaseImage')
featured_img_url =(featured_img[0]['src'])

In [43]:
print(featured_img_url)

https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA24505.width-1024.jpg


In [44]:
#Connect to established MongoDB to insert into collection
client.mars_db.mars.insert_one({'news_title': news_title,
                                'news_p':news_p,
                                'featured_image_url': featured_img_url})

<pymongo.results.InsertOneResult at 0x2def0f93980>

# Mars Facts

In [45]:
#Use pandas to parse mars facts page
mars_facts_url = 'https://space-facts.com/mars/'

In [46]:
#Read into html with pandas
table_list = pd.read_html(mars_facts_url)
len(table_list)

3

In [47]:
#Convert data to a HTML table string
for each_table in table_list:
    display(each_table.head())

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"


Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days


Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"


In [48]:
#Select one of the tables to save and clean table for 
mars_df = table_list[0]
mars_df.columns = ['Mars Planet Profile', 'Facts']
clean_mars = mars_df.drop(0)
cleaner_mars = clean_mars.set_index('Mars Planet Profile')
cleaner_mars

Unnamed: 0_level_0,Facts
Mars Planet Profile,Unnamed: 1_level_1
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [49]:
#Export as a .html file
cleaner_mars.to_html('marsfacts', index=True)

# Mars Hemispheres

In [50]:
#Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\jrose\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache






In [51]:
#Setup BeautifulSoup
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

browser.visit(url)
html=browser.html
soup = BeautifulSoup(html, 'html.parser')

In [52]:
#Use inspection tool to find tag and class associated with image
mars_hemis_info = soup.find_all('div', class_= 'item')

In [53]:
#Set up a for loop to find all hemisphere images and the titles
img_urls = []

for x in range(len(mars_hemis_info)):
    html = browser.find_by_tag('h3')
    html[x].click()
    
    img_html = browser.html
    soup = BeautifulSoup(img_html, 'html.parser')
    
    main_hemis_url = 'https://astrogeology.usgs.gov'
    full_img_url = soup.find('img', class_='wide-image')['src']
    fin_url = main_hemis_url + full_img_url
    title = browser.find_by_css('.title').text
    img_urls.append({'title': title,
                         'img_url': fin_url})
    
    browser.back()
browser.quit()

In [54]:
#Verify that title and image data accessed is correct
img_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

In [55]:
#Setup to save into MongoDB

# hemisphere_image_urls = [
#     {"title": "Valles Marineris Hemisphere", "img_url": "..."},
#     {"title": "Cerberus Hemisphere", "img_url": "..."},
#     {"title": "Schiaparelli Hemisphere", "img_url": "..."},
#     {"title": "Syrtis Major Hemisphere", "img_url": "..."},
# ]

In [58]:
#Connect to established MongoDB to insert into collection
client.mars_db.mars.insert_one({'news_title': news_title,
                                'news_p':news_p,
                                'featured_image_url': featured_img_url,
                                'mars_hemispheres': img_urls})

<pymongo.results.InsertOneResult at 0x2def0ff7b40>

In [57]:
# #Save to MongoDB
# client.mars_db.mars.insert_one({'mars_hemispheres': img_urls})