Importing dependencies

In [27]:
from urllib.parse import urljoin

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

Scraping for the latest Mars-related headline and associated teaser

In [28]:
# Fetch the NASA Mars news page (https://mars.nasa.gov/news/) and pull out the
# most recent headline together with the teaser text that accompanies it.
url = 'https://mars.nasa.gov/news/'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
bs_news = bs(response.text, 'html.parser')

# find() returns None when nothing matches; fail with a clear message rather
# than an AttributeError on `.text`.
title_tag = bs_news.find(class_='content_title')
teaser_tag = bs_news.find(class_='rollover_description')
if title_tag is None or teaser_tag is None:
    raise ValueError(f'Could not locate the headline/teaser elements on {url}')

title_news = title_tag.text.strip()
teaser_news = teaser_tag.text.strip()

print(f'Title of the first news article is: "{title_news}"')
print(f'Article teaser: "{teaser_news}"')

Title of the first news article is: "NASA Ingenuity Mars Helicopter Prepares for First Flight"
Article teaser: "Now uncocooned from its protective carbon-fiber shield, the helicopter is being readied for its next steps."


Scraping for the featured Mars image of the day

In [29]:
# Use splinter to drive a real Chrome window for the featured-image page at
# https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html

# Set up splinter: the value returned by ChromeDriverManager().install() is
# handed to splinter as `executable_path`.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

# Keep the base url and the page name separate so the base can be concatenated
# with the image path later.
url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/'
index_url = 'index.html'
browser.visit(f'{url}{index_url}')

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\kqo2\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache




In [30]:
# The featured image is the <img> tag with class "headerimage fade-in"; its
# `src` attribute holds the image path relative to `url`.

try:
    # Use BeautifulSoup to parse the page currently loaded in the splinter browser.
    html = browser.html
    ftimg_soup = bs(html, 'html.parser')

    # Search the soup built just above. (This cell previously called
    # `soup.find`, but `soup` is only defined in the hemisphere section, so a
    # fresh top-to-bottom run failed here.)
    ftimg_tag = ftimg_soup.find('img', class_="headerimage fade-in")
    if ftimg_tag is None:
        raise ValueError('Featured image tag "headerimage fade-in" not found on the page')
    ftimg_url = ftimg_tag.attrs['src']  # about .attrs: https://towardsdatascience.com/soup-of-the-day-97d71e6c07ec

    # Concatenate the relative image path with the base url set in the previous cell.
    featured_image_url = url + ftimg_url
    browser.visit(featured_image_url)
finally:
    # Always quit the splinter browser session, even if the scrape failed.
    browser.quit()

Scraping for Mars facts

In [35]:
# Let pandas read the tables found on the Mars facts page; the result is a
# list of DataFrames, displayed below.
facts_url = "https://space-facts.com/mars/"
tables = pd.read_html(io=facts_url)
tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
   Mars - Earth Comparison             Mars            Earth
 0               Diameter:         6,779 km        12,742 km
 1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 2                  Moons:                2                1
 3      Distance from Sun:   227,943,824 km   149,598,262 km
 4         Length of Year:   687 Earth days      365.24 days
 5            Temperature:     -87 to -5 °C      -88 to 58°C,
           

In [59]:
# After inspecting the tables, the first one (index 0) holds the information
# we'd like to include in our site.
# Work on a copy so that renaming the columns does not also alter `tables[0]`,
# which an earlier cell already displayed.
df = tables[0].copy()
df.columns = ['Variable', 'Mars Data']  # replace the default 0/1 column labels: https://note.nkmk.me/en/python-pandas-dataframe-rename/
df

Unnamed: 0,Variable,Mars Data
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [60]:
# Write the Mars facts table out as an HTML file named table.html.
df.to_html(buf='table.html')

Scraping for the images of Mars' four hemispheres

In [116]:
# Set up splinter again (the earlier browser session has already been quit).
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

# Keep the base url and the search path separate so the base can be combined
# with each hemisphere path later.
url = 'https://astrogeology.usgs.gov/'
hems_url = 'search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(f'{url}{hems_url}')


[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\kqo2\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache




In [118]:
# Each hemisphere's link and name live inside an element with class "item"
# on the search-results page.
html = browser.html
soup = bs(html, 'html.parser')
sidebar = soup.find_all(class_='item')

# Build the list of hemisphere page urls from the first <a> inside each "item".
# urljoin resolves the href against the site root: a root-relative href
# ("/search/...") does not end up with a double slash and an absolute href is
# kept as-is, which plain `url + href` concatenation does not guarantee.
url_list = [urljoin(url, item.find('a')['href']) for item in sidebar]
print(url_list)

['https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg']


In [None]:
# Visit every hemisphere page from url_list and collect, for each one, the href
# of the first link inside the first <li> on the page (used as the image path).
hemimgs_list = []
for page_url in url_list:
    browser.visit(page_url)
    soup = bs(browser.html, 'html.parser')
    first_li = soup.find('li')
    hemimgs_list.append(first_li.find('a')['href'])
print(hemimgs_list)

In [115]:
# Visit every hemisphere page from url_list and collect its title (the text of
# <h2 class="title">), with surrounding whitespace and the word "Enhanced" removed.
hemtitles_list = []
try:
    for page_url in url_list:
        browser.visit(page_url)
        soup = bs(browser.html, 'html.parser')
        title_tag = soup.find('h2', class_="title")
        # Strip whitespace, then drop the " Enhanced" part of the hemisphere title.
        hem_title = title_tag.text.strip().replace(' Enhanced', '')
        hemtitles_list.append(hem_title)
finally:
    # This is the last cell that uses the second splinter session; quit it so
    # the Chrome window and driver are not left running.
    browser.quit()
print(hemtitles_list)

['Cerberus Hemisphere', 'Schiaparelli Hemisphere', 'Syrtis Major Hemisphere', 'Valles Marineris Hemisphere']


In [125]:
# Finally, pair each hemisphere title with its image url in a list of
# dictionaries that use the keys "title" and "img_url".
hemisphere_image_urls = [
    {"title": hemtitles_list[idx], "img_url": img_path}
    for idx, img_path in enumerate(hemimgs_list)
]
print(hemisphere_image_urls)


[{'title': 'Cerberus Hemisphere', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'title': 'Schiaparelli Hemisphere', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}, {'title': 'Syrtis Major Hemisphere', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'}, {'title': 'Valles Marineris Hemisphere', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]
