Project: Web Scraping Challenge  
Name: Jodi Heen  
Goal: Using Beautiful Soup, scrape a website for desired content, load results into a Mongo Database, and show active results using a Flask App dashboard.

In [1]:
# Import Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import pandas as pd

In [2]:
# activate chromedriver
!which chromedriver

/usr/local/bin/chromedriver


In [3]:
# Establish the path and browser connection
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [4]:
# Initial url to visit
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

In [5]:
browser.visit(url)

In [6]:
# Fetch the first headline and paragraph visible below it
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

list_item = soup.select_one('ul.item_list li.slide')

news_title = browser.find_by_tag('div[class="content_title"]')
teaser = browser.find_by_tag('div[class="article_teaser_body"]')
print(news_title.text)
print(teaser.text)


Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first.


In [7]:
# Featured Images
image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

In [8]:
# Initial URL
browser.visit(image_url)

In [9]:
# Clicking "Full Image" button
footer = browser.find_by_tag('footer')
within_footer = footer.find_by_tag('a').click()

In [10]:
# Clicking "more info" button
more_info = browser.find_by_id('fancybox-lock')
a_tag = more_info.find_by_tag('a[class="button"]').click()

In [11]:
# Clicking the image for full size
div = browser.find_by_id('page')
section = div.find_by_tag('section')
jpg_image = section.find_by_tag('img').click()

In [12]:
# Print browser url to check it
browser.url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17652_hires.jpg'

In [13]:
# Save the featured image url
featured_jpg_img = browser.url

In [14]:
# Print this so that it's available
print(featured_jpg_img)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17652_hires.jpg


In [15]:
# Mars Weather via Twitter
mars_wth = 'https://twitter.com/marswxreport?lang=en'

In [16]:
browser.visit(mars_wth)

In [17]:
r"sol"

'sol'

In [18]:
# something to try https://splinter.readthedocs.io/en/latest/matchers.html
# div = browser.find_by_tag('main')
section = browser.is_element_present_by_tag('span')
# main_content = soup.find('main', attrs = {'class': 'span'})
# print(main_content)
# jpg_image = section.find_by_tag('img').click()


In [19]:
# url = 'https://twitter.com/marswxreport?lang=en'

In [20]:
# browser = webdriver.Chrome()
# browser.get(url)

In [21]:
# main = browser.find_element_by_tag_name('main')

In [22]:
# tweet = browser.find_elements_by_class_name('span')

In [23]:
# print(tweet)

In [24]:
# Mars Facts
mfacts_url = 'https://space-facts.com/mars/'

In [25]:
# Using Pandas, read in the html to a table
mars_tables = pd.read_html(mfacts_url)
mars_tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
   Mars - Earth Comparison             Mars            Earth
 0               Diameter:         6,779 km        12,742 km
 1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 2                  Moons:                2                1
 3      Distance from Sun:   227,943,824 km   149,598,262 km
 4         Length of Year:   687 Earth days      365.24 days
 5            Temperature:    -153 to 20 °C      -88 to 58°C,
           

In [26]:
# Transform table into dataframe
mfacts_df = mars_tables[0]
mfacts_df.columns = ['description', 'value']
mfacts_df

Unnamed: 0,description,value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [27]:
# Mars Hemispheres
h_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [28]:
browser.visit(h_url)

In [29]:
# Caputring the titles (one)
head = browser.find_by_tag('div[class="collapsible results"]')
div = browser.find_by_tag('div[class="item"]')[0]
title_1 = div.find_by_tag('h3').text
print(title_1)

Cerberus Hemisphere Enhanced


In [30]:
# Clicking to the images (one)
head = browser.find_by_tag('div[class="collapsible results"]')
div = browser.find_by_tag('div[class="description"]')[0]
image = div.find_by_tag('a').click()

In [31]:
# Capturing full size image
head = browser.find_by_tag('div[class="widget block bar"]')
ul = browser.find_by_tag('ul')
li = ul.find_by_tag('li')[0]
image1 = li.find_by_tag('a').click()

In [32]:
image1_url = browser.url
print(image1_url)

https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced


In [33]:
# Revisit browser in order to capture correct urls and images
h_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [34]:
browser.visit(h_url)

In [35]:
# Capturing the titles (two)
head = browser.find_by_tag('div[class="collapsible results"]')
div = browser.find_by_tag('div[class="item"]')[1]
title_2 = div.find_by_tag('h3').text
print(title_2)

Schiaparelli Hemisphere Enhanced


In [36]:
# Clicking to the images (two)
head = browser.find_by_tag('div[class="collapsible results"]')
div = browser.find_by_tag('div[class="description"]')[1]
image = div.find_by_tag('a').click()

In [37]:
# Capturing full size image
head = browser.find_by_tag('div[class="widget block bar"]')
ul = browser.find_by_tag('ul')
li = ul.find_by_tag('li')[0]
image2 = li.find_by_tag('a').click()

In [38]:
image2_url = browser.url
print(image2_url)

https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced


In [39]:
# Revisit browser in order to capture correct urls and images
h_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [40]:
browser.visit(h_url)

In [41]:
# Capture third title
head = browser.find_by_tag('div[class="collapsible results"]')
div = browser.find_by_tag('div[class="item"]')[2]
title_3 = div.find_by_tag('h3').text
print(title_3)

Syrtis Major Hemisphere Enhanced


In [42]:
# Clicking to the images (three)
head = browser.find_by_tag('div[class="collapsible results"]')
div = browser.find_by_tag('div[class="description"]')[2]
image = div.find_by_tag('a').click()

In [43]:
# Capturing full size image
head = browser.find_by_tag('div[class="widget block bar"]')
ul = browser.find_by_tag('ul')
li = ul.find_by_tag('li')[0]
image3 = li.find_by_tag('a').click()

In [44]:
image3_url = browser.url
print(image3_url)

https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced


In [45]:
# Revisit browser in order to capture correct urls and images
h_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [46]:
browser.visit(h_url)

In [47]:
# Capture fourth title
head = browser.find_by_tag('div[class="collapsible results"]')
div = browser.find_by_tag('div[class="item"]')[3]
title_4 = div.find_by_tag('h3').text
print(title_4.text)

Valles Marineris Hemisphere Enhanced


In [48]:
# Clicking to the images (four)
head = browser.find_by_tag('div[class="collapsible results"]')
div = browser.find_by_tag('div[class="description"]')[3]
image = div.find_by_tag('a').click()

In [49]:
# Capturing full size image
head = browser.find_by_tag('div[class="widget block bar"]')
ul = browser.find_by_tag('ul')
li = ul.find_by_tag('li')[0]
image1 = li.find_by_tag('a').click()

In [50]:
image4_url = browser.url
print(image4_url)

https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced


In [57]:
# Hemisphere data dictionary
hemis_dict = [
    {"title": title_1, "img_url": image1_url}
    {"title": title_2, "img_url": image2_url},
    {"title": title_3, "img_url": image3_url},
    {"title": title_4, "img_url": image4_url}
]

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=50538): Max retries exceeded with url: /session/d07e7ac07f64b3d169918cb7f31384e7/element/34b7176e-6d96-4a9a-bb14-42962ac6aee9/text (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1155f3390>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [56]:
hemis_dict

[{'title': <splinter.element_list.ElementList at 0x114f3f990>,
  'img_url': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'},
 {'title': <splinter.element_list.ElementList at 0x11557bd50>,
  'img_url': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced'},
 {'title': <splinter.element_list.ElementList at 0x115585250>,
  'img_url': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced'},
 {'title': <splinter.element_list.ElementList at 0x115564810>,
  'img_url': 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced'}]

In [58]:
browser.quit()

In [54]:
# from selenium import webdriver
# twitter_url = 'https://twitter.com/marswxreport?lang=en'
# browser.visit(twitter_url)
# driver = webdriver.Chrome()
# driver.get(twitter_url)
# html = driver.page_source

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=50538): Max retries exceeded with url: /session/d07e7ac07f64b3d169918cb7f31384e7/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11556a9d0>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [None]:
soup = BeautifulSoup(html, 'html.parser')

In [None]:
tweet = soup.find('span', {'class':"css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0"}) # css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0"
print(tweet.text)

In [None]:
twitter_url = 'https://twitter.com/marswxreport?lang=en'
driver = webdriver.Chrome()
driver.get(twitter_url)
html = driver.page_source
driver.close()

In [None]:
hemisphere_image_urls = [
    {"title": {title_1}, "img_url": "..."},
    {"title": "Cerberus Hemisphere", "img_url": "..."},
    {"title": "Schiaparelli Hemisphere", "img_url": "..."},
    {"title": "Syrtis Major Hemisphere", "img_url": "..."},
]