In [1]:
# Use MongoDB with Flask templating to create a new HTML page that displays all of the information 
# that was scraped from the URLs list

# import dependencies
# pip install pymongo
import datetime as dt
from urllib.parse import urljoin

import pymongo
import requests
from bs4 import BeautifulSoup as soup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Connect to the MongoDB server at localhost:27017 through the PyMongo driver.
# NOTE(review): `client` is not used by any cell visible here — confirm where the
# scraped dictionary is actually written to the database.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# create a route called /scrape that will import your scrape_mars.py script and call your scrape function.

# Create a root route / to query Mongo database and pass the mars data into an HTML template 
# to display the data.

# Create a template HTML file called index.html that will take the mars data dictionary and display 
# all of the data in the appropriate HTML elements. Use the following as a guide for what the 
# final product should look like, but feel free to create your own design.

In [4]:
# Define scrape_all(), which returns a single dictionary containing all of the scraped data.
# The return value is meant to be stored in MongoDB as one document.

# scrape_all() runs every scraper below; each one visits a different website.
def scrape_all():
    """Run every Mars scraper in one browser session and collect the results.

    A Chrome browser is started through splinter, handed to each scraper in
    turn, and always shut down afterwards -- even when a scraper raises -- so
    a failed run does not leave a browser window and driver process behind.

    Returns:
        dict: the scraped data under the keys "NewsTitle", "NewsParagraph",
        "FeaturedImage", "Facts", "Hemispheres" and "LastUpdated"
        (``datetime.datetime.now()`` taken after the scrapers have run).

    Raises:
        Any exception raised by a scraper is propagated after the browser
        has been closed.
    """
    # set up splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    try:
        # collect information from the news page
        news_title, news_paragraph = scrape_news(browser)
        # build the dictionary using the information from the scrapes
        mars_data = {
            "NewsTitle": news_title,
            "NewsParagraph": news_paragraph,
            "FeaturedImage": scrape_feature_img(browser),
            "Facts": scrape_facts_page(browser),
            "Hemispheres": scrape_hemispheres_page(browser),
            "LastUpdated": dt.datetime.now()
        }
    finally:
        # stop the webdriver whether or not the scrape succeeded
        browser.quit()

    return mars_data

In [5]:
# scrape the mars news page
def scrape_news(browser):
    """Scrape the latest headline and its teaser from the Mars news site.

    Args:
        browser: a splinter browser; it is navigated to the news site.

    Returns:
        tuple: ``(news_title, news_paragraph)`` taken from the first
        ``div.content_title`` and ``div.article_teaser_body`` on the page,
        or ``(None, None)`` when either element is missing (for example
        because the page had not finished rendering).
    """
    # Visit the Mars news site
    url = 'https://redplanetscience.com/'
    browser.visit(url)

    # give the article list up to one second to appear before reading the page
    browser.is_element_present_by_css('div.list_text', wait_time=1)

    # Convert the browser html to a soup object
    news_soup = soup(browser.html, 'html.parser')

    title_elem = news_soup.find('div', class_='content_title')
    teaser_elem = news_soup.find('div', class_='article_teaser_body')

    # without both elements there is no headline to report; avoid calling
    # .text on None, which would raise AttributeError
    if title_elem is None or teaser_elem is None:
        return None, None

    # return the title and paragraph information
    return title_elem.text, teaser_elem.text

In [6]:
# scrape the featured image page
def scrape_feature_img(browser):
    """Scrape the URL of the full-size featured image.

    Args:
        browser: a splinter browser; it is navigated to the image site and
            the second ``<button>`` on the page is clicked to open the
            full-size image.

    Returns:
        str | None: the absolute image URL, or ``None`` when no
        ``img.fancybox-image`` element with a ``src`` attribute is found.
    """
    # Visit URL
    base_url = 'https://spaceimages-mars.com'
    browser.visit(base_url)

    # Find and click the full image button
    full_image_button = browser.find_by_tag('button')[1]
    full_image_button.click()

    # Parse the resulting HTML with Beautiful Soup
    image_soup = soup(browser.html, 'html.parser')

    # find the image url; the element or its src may be absent
    image_elem = image_soup.find('img', class_='fancybox-image')
    img_url_rel = image_elem.get('src') if image_elem is not None else None
    if not img_url_rel:
        return None

    # Use the base url to create an absolute url; urljoin copes with a
    # leading slash in the src instead of producing "//"
    return urljoin(f'{base_url}/', img_url_rel)

In [7]:
# scrape the facts page
def scrape_facts_page(browser):
    """Scrape the Mars facts table as an HTML string.

    Args:
        browser: a splinter browser; it is navigated to the facts site.

    Returns:
        str: the markup of the first ``<table>`` inside the
        ``div.diagram.mt-4`` element, or an empty string when that div or
        its table is missing.
    """
    # Visit URL
    url = 'https://galaxyfacts-mars.com/'
    browser.visit(url)

    # Parse HTML with Beautiful Soup
    fact_soup = soup(browser.html, 'html.parser')

    # find the facts location
    facts_location = fact_soup.find('div', class_="diagram mt-4")
    fact_table = facts_location.find('table') if facts_location is not None else None

    # str(None) would hand the literal text "None" to the caller, so report
    # a missing table as an empty string instead
    if fact_table is None:
        return ""

    return str(fact_table)

In [8]:
# scrape through the hemispheres pages
def scrape_hemispheres_page(browser):
    """Scrape the title and sample-image URL of every listed hemisphere.

    Args:
        browser: a splinter browser; it is navigated to the hemispheres
            site, into each hemisphere's page, and back again.

    Returns:
        list[dict]: one ``{"img_url": ..., "title": ...}`` dictionary per
        thumbnail found on the index page, in page order. The list is empty
        when the page shows no thumbnails.
    """
    # base url
    url = "https://marshemispheres.com/"
    browser.visit(url)

    thumbnail_css = 'a.product-item img'
    # count the thumbnails actually on the page rather than assuming four
    thumbnail_count = len(browser.find_by_css(thumbnail_css))

    # Create a list to hold the images and titles
    hemisphere_image_urls = []

    for i in range(thumbnail_count):
        # find the elements on each loop to avoid a stale element exception
        browser.find_by_css(thumbnail_css)[i].click()

        # find the Sample image anchor tag and extract the href
        sample = browser.links.find_by_text('Sample').first

        hemisphere_image_urls.append({
            "img_url": sample['href'],
            "title": browser.find_by_css('h2.title').text,
        })

        # navigate back to the index page for the next thumbnail
        browser.back()

    # return the hemisphere url with the titles
    return hemisphere_image_urls

In [9]:
# Entry point: when this code runs as the main module, perform the full scrape
# and print the resulting dictionary.
# NOTE(review): no Flask app is created in the cells visible here; the Flask
# routes described earlier would have to live elsewhere — confirm.
if __name__ == "__main__":
    print(scrape_all())



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [C:\Users\juch0\.wdm\drivers\chromedriver\win32\98.0.4758.80\chromedriver.exe] found in cache


{'NewsTitle': 'My Culture, My Voice', 'NewsParagraph': 'In honor of Hispanic Heritage Month, Christina Hernandez, an instrument engineer on the Mars 2020 mission, talks about her childhood and journey to NASA.', 'FeaturedImage': 'https://spaceimages-mars.com/image/featured/mars1.jpg', 'Facts': '<table class="table">\n<tbody>\n<tr>\n<th scope="row"><b> Mars - Earth Comparison</b></th>\n<td><span class="orange"><b> Mars</b></span></td>\n<td><span class="purple"> <b>Earth </b></span> </td>\n</tr>\n<tr>\n<th scope="row">Diameter:</th>\n<td><span class="orange">6,779 km</span></td>\n<td><span class="purple">12,742 km</span> </td>\n</tr>\n<tr>\n<th scope="row">Mass:</th>\n<td><span class="orange">6.39 × 10^23 kg </span></td>\n<td><span class="purple">5.97 × 10^24 kg</span> </td>\n</tr>\n<tr>\n<th scope="row">Moons:</th>\n<td><span class="orange">2</span></td>\n<td><span class="purple">1</span> </td>\n</tr>\n<tr>\n<th scope="row">Distance from Sun:</th>\n<td><span class="orange">227,943,824 k