In [None]:
import pandas as pd
import pymongo
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import time

In [None]:
# Open a client against the MongoDB server on localhost (port 27017)
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(host=conn)

In [None]:
# Pick the database and the collection used to store the scraped data.
# Bracket lookup is the same as attribute lookup on pymongo objects.
db = client["mars_db"]
collection = db["mars"]

In [None]:
# Resolve the ChromeDriver binary via webdriver_manager, then start a visible
# (non-headless) Chrome session driven by splinter
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

## NASA Mars News Scraping

[Mars News Site](https://redplanetscience.com/)

In [None]:
# Point the already-open browser at the Mars news site
url1 = "https://redplanetscience.com/"
browser.visit(url1)

In [None]:
# Grab the browser's current page source and parse it with BeautifulSoup
news_html = browser.html
soup = BeautifulSoup(news_html, features="html.parser")

In [None]:
# Every news card on the page is a <div class="list_text"> that holds a
# headline div ("content_title") and a teaser div ("article_teaser_body").
articles = soup.find_all("div", class_="list_text")

for article in articles:

    title_tag = article.find("div", class_="content_title")
    body_tag = article.find("div", class_="article_teaser_body")

    # Skip a malformed card instead of crashing on `None.text`
    if title_tag is None or body_tag is None:
        continue

    news_title = title_tag.text
    news_body = body_tag.text

    print("------------------------------------------")
    print(f"Headline:  {news_title}")
    print(f"Content:  {news_body}")

    # Document to be stored in MongoDB
    post = {"Headline": news_title, "Content": news_body}

    # Upsert keyed on the headline: re-running this cell refreshes the stored
    # article instead of inserting another copy of it on every run.
    collection.update_one({"Headline": news_title}, {"$set": post}, upsert=True)


In [None]:
# Read every document back from the collection and print it as a sanity check
results = collection.find()
for result in results:
    print(result)

In [None]:
# Close remote browser
# browser.quit()

## JPL Mars Space Images Scraping

[Featured Space Image site](https://spaceimages-mars.com)

In [None]:
# Shut down the Chrome session opened for the news section before starting a
# new one: rebinding `browser` without quitting would leave the old window
# and its chromedriver process running with no handle left to close them.
browser.quit()

# Define executable path and initialize a fresh browser
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

In [None]:
# Navigate the browser to the featured space image site
url2 = "https://spaceimages-mars.com/"
browser.visit(url2)

In [None]:
# Grab the browser's current page source and parse it with BeautifulSoup
image_html = browser.html
soup = BeautifulSoup(image_html, features="html.parser")

In [None]:
# Locate the <img class="headerimage"> tag and read its src attribute
header_img = soup.find("img", class_="headerimage")
image_path = header_img["src"]
print(image_path)

In [None]:
# Prefix the scraped src with the site root to get the full image URL
featured_img_url = f"https://spaceimages-mars.com/{image_path}"
print(featured_img_url)

In [None]:
# Do not close the browser yet: the Mars Hemispheres section below reuses this session.
# browser.quit()

## Mars Facts

[Mars Facts site](https://galaxyfacts-mars.com)

In [None]:
# Have pandas fetch the page and parse each HTML <table> into a DataFrame
url3 = "https://galaxyfacts-mars.com/"
tables = pd.read_html(io=url3)

tables

In [None]:
# Take the first parsed table (index 0) and preview it
df1 = tables[0]
df1.head()

In [None]:
# Remove the Earth column (integer column label 2)
df1 = df1.drop(columns=[2])
df1.head()

In [None]:
# Give the two remaining integer-labelled columns readable names
header_names = {0: 'Fact', 1: 'Data'}
df1 = df1.rename(columns=header_names)
df1.head()

In [None]:
# Drop the "Mars - Earth Comparison" header row.
# Boolean indexing returns a new DataFrame, so the result has to be assigned
# back; otherwise df1 keeps the row and it ends up in the combined table.
df1 = df1[df1.Fact != "Mars - Earth Comparison"]
df1


In [None]:
# Take the second parsed table (index 1) and preview it
df2 = tables[1]
df2.head()

In [None]:
# Apply the same Fact/Data column names to the second table
header_names = {0: 'Fact', 1: 'Data'}
df2 = df2.rename(columns=header_names)
df2.head()

In [None]:
# Stack the two fact tables into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
facts_df = pd.concat([df1, df2], ignore_index=True, sort=False)

facts_df

In [None]:
# Render the combined facts frame as an HTML table string
fact_table = facts_df.to_html()
fact_table

In [None]:
# Do not close the browser yet: the Mars Hemispheres section below reuses this session.
# browser.quit()

## Mars Hemispheres

[Mars Hemispheres site](https://marshemispheres.com/)

In [None]:
# Navigate the browser to the Mars hemispheres site
url4 = "https://marshemispheres.com/"
browser.visit(url4)

In [None]:
# Grab the browser's current page source and parse it with BeautifulSoup
hemi_html = browser.html
soup = BeautifulSoup(hemi_html, features="html.parser")

In [None]:
# Hand-written list of hemisphere records: one dictionary per hemisphere,
# holding its title and the URL string of its enhanced image.
hemisphere_img_urls = [
    {
        "title": "Cerberus Hemisphere",
        "img_url": "https://marshemispheres.com/images/cerberus_enhanced.tif",
    },
    {
        "title": "Schiaparelli Hemisphere",
        "img_url": "https://marshemispheres.com/images/schiaparelli_enhanced.tif",
    },
    {
        "title": "Syrtis Major Hemisphere",
        "img_url": "https://marshemispheres.com/images/syrtis_major_enhanced.tif",
    },
    {
        "title": "Valles Marineris Hemisphere",
        "img_url": "https://marshemispheres.com/images/valles_marineris_enhanced.tif",
    },
]

In [None]:
# Display the hand-written hemisphere list
hemisphere_img_urls

In [None]:
# Second attempt: scrape the hemisphere titles and image URLs in a loop
# instead of typing them in by hand. Start from the site's landing page.
url5 = "https://marshemispheres.com/"
browser.visit(url5)

In [None]:
# Grab the landing page source and parse it with BeautifulSoup
hemi2_html = browser.html
soup = BeautifulSoup(hemi2_html, features="html.parser")

In [None]:
# Retrieve every <div class="description"> block from the parsed landing page.
# The page was parsed into `soup`; the previously used name `hemispheresSoup`
# is never defined and raised NameError on a fresh kernel.
links = soup.find_all("div", class_="description")

# List that collects one {"title", "img_url"} dictionary per hemisphere
hemisphere_image_urls = []

for link in links:

    # Title text and the href of the hemisphere's detail page
    img_title = link.find("h3").text
    img_link = link.find("a", class_="itemLink product-item")["href"]

    # Open the detail page, which holds the full-resolution image
    browser.visit(url5 + img_link)

    # Parse the detail page. A separate name is used for the HTML so the loop
    # variable `link` is not overwritten with a string mid-iteration.
    detail_html = browser.html
    linksoup = BeautifulSoup(detail_html, "html.parser")

    # Full resolution image URL: site root + src of the "wide-image" tag
    url = url5 + linksoup.find("img", class_="wide-image")["src"]

    # Append to list of dictionaries
    hemisphere_image_urls.append({"title": img_title, "img_url": url})

    # Display titles and links
    print("------------------------------------------")
    print(f"Title:  {img_title}")
    print(f"Link:   {url}")

# Display end statement
print(" ")
print("Scraping Complete.")

# Close remote browser
browser.quit()