In [27]:
# Import dependencies
from bs4 import BeautifulSoup as bs
from webdriver_manager.chrome import ChromeDriverManager
from splinter import Browser
import pandas as pd
import requests
import pymongo
import os
import time
from config import nasa_url, facts_url, mars_hemisphere_url

In [28]:
# URL of page to be scraped
url = nasa_url

# Retrieve page with the requests module
response = requests.get(url, verify=True)

# Parse HTMl
soup = bs(response.text, 'html.parser')

In [29]:
# Grab latest headline and tagline
headline = soup.find_all('div', class_="content_title")[0].text
print(headline)

# Grab the tagline for the above headline
tagline = soup.find_all('div', class_="rollover_description_inner")[0].text
print(tagline)



NASA's Perseverance Sheds More Light on Jezero Crater's Watery Past



Pictures from NASA’s latest six-wheeler on the Red Planet suggest the area’s history experienced significant flooding events.



In [30]:
# Use Splinter to visit site for featured image - First init Splinter
executable_path = {'executable_path': ChromeDriverManager().install()} # Ensure latest version is installed
browser = Browser('chrome', **executable_path, headless=False) # Initialize browser

# Visit URL
ft_img_grab_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(ft_img_grab_url)

# Open featured image full res destination
browser.links.find_by_partial_text('FULL IMAGE').click()
time.sleep(6)

# Parse scraped HTML
scraped_html = browser.html
img_parse = bs(scraped_html, 'html.parser')

# Grab full res image
featured_image_scrape = img_parse.find('img', class_="fancybox-image")
featured_image_url = featured_image_scrape.attrs['src']

# Get full link
featured_img_final = f"https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{featured_image_url}"
featured_img_final



Current google-chrome version is 95.0.4638
Get LATEST driver version for 95.0.4638
Driver [/Users/josephchancey/.wdm/drivers/chromedriver/mac64/95.0.4638.54/chromedriver] found in cache


'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars1.jpg'

In [31]:
# Closer Splinter browser
browser.quit()

In [32]:
# Scrape table with Pandas
table_url = "https://space-facts.com/mars/"

# read_html to automatically scrape every table from page
tables = pd.read_html(requests.get(table_url).text)
tables
# Assign to dataframe to view data - [0] index is the table we wanted
df = tables[0]
df.head()

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"


In [33]:
# Clean DataFrame
# Rename column
df = df.rename(columns={1: "Values", 0:"Query"})
# Set New Index
df = df.set_index('Query')
df

Unnamed: 0_level_0,Values
Query,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [39]:
# Convert to HTML so we can plug it right into the website
df_html = df.to_html()

In [40]:
# NEED TO CHANGE NAMING SCHEME AND ADD COMMENTS

executable_path = {'executable_path': ChromeDriverManager().install()} # Ensure latest version is installed
browser = Browser('chrome', **executable_path, headless=False) # Initialize browser

browser.visit(mars_hemisphere_url)

time.sleep(4)

# Assign the HTML content of the page to a variable
hemisphere_html = browser.html
# Parse HTML with Beautifulsoup
soup = bs(hemisphere_html,'html.parser')

# Collect the urls for the hemisphere images
items = soup.find_all("div", class_="item")

main_url = "https://astrogeology.usgs.gov"
hemisphere_urls = []

for item in items:
    hemisphere_urls.append(f"{main_url}{item.find('a', class_='itemLink')['href']}")

print(*hemisphere_urls, sep = "\n")

# Create a list to store the data
hemisphere_image_urls=[]

# Loop through each url
for url in hemisphere_urls:
    # Navigate to the page
    browser.visit(url)
    
    time.sleep(4)
    
    # Assign the HTML content of the page to a variable
    hemisphere_html = browser.html
    # Parse HTML with Beautifulsoup
    soup = bs(hemisphere_html,'html.parser')
    
    img_url = soup.find('img', class_="wide-image")['src']
    title = soup.find('h2', class_="title").text
    
    hemisphere_image_urls.append({"title":title,"img_url":f"https://astrogeology.usgs.gov{img_url}"})

hemisphere_image_urls



Current google-chrome version is 95.0.4638
Get LATEST driver version for 95.0.4638
Driver [/Users/josephchancey/.wdm/drivers/chromedriver/mac64/95.0.4638.54/chromedriver] found in cache


https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced


[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

In [36]:
# DEPRICATED - USE CELL ABOVE - 


# Use Splinter to visit site for featured image - First init Splinter
executable_path = {'executable_path': ChromeDriverManager().install()} # Ensure latest version is installed
browser = Browser('chrome', **executable_path, headless=False) # Initialize browser
# Get hemisphere data 
# Use splinter to visit site 
browser.visit(mars_hemisphere_url)
# Gather HTML
hemi_html = browser.html
# Parse HTML
hemi_soup = bs(hemi_html, 'html.parser')
# Retreive all relevant items
hemi_items = hemi_soup.find_all('div', class_='item')
# Create empty list variable to fill with data in upcoming loop
hemi_img_urls = []
# Main Website URL - For later appending
hemi_main_url = 'https://astrogeology.usgs.gov'


# Loop through the items previously stored
for hemi in hemi_items: 
    # Grab title in this iteration
    title = hemi.find('h3').text
    # Grab link for full image path
    temp_img_url = hemi.find('a', class_='itemLink product-item')['href']
    # Path to the full image page
    browser.visit(hemi_main_url + temp_img_url)
    # Rip HTML from page that has full image
    temp_img_html = browser.html
    # Parse that above HTML that was just grabbed 
    temp_soup = bs(temp_img_url, 'html.parser')
    # Grab full image
    img_url = hemi_main_url + temp_soup.find('img', class_='wide-image')['src']
    #### ^^^^ Something is very broken here - returns NoneType error - Mathmatically not good... 
    # Append into a dictionary 
    hemi_img_urls.append({"title" : title, "img_url" : img_url})
    
hemi_img_urls



Current google-chrome version is 95.0.4638
Get LATEST driver version for 95.0.4638
Driver [/Users/josephchancey/.wdm/drivers/chromedriver/mac64/95.0.4638.54/chromedriver] found in cache


TypeError: 'NoneType' object is not subscriptable