In [4]:
#Import Dependencies

from bs4 import BeautifulSoup
import requests
import pymongo
from splinter import Browser
import pandas 
from webdriver_manager.chrome import ChromeDriverManager

# Imports for routes
from flask import Flask, render_template, redirect
from flask_pymongo import PyMongo
import scrape_mars

from selenium import webdriver

In [18]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [19]:
db = client.mars_db
collection = db.items

In [20]:
# Sets a path to Google Chrome

executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/joescuteri/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


In [21]:
# URL of the page to be scraped

url = 'https://redplanetscience.com/'

In [22]:
# Visit the browser

browser.visit(url)

In [23]:
# Convert the browser to HTML

html = browser.html

In [24]:
# Create BeautifulSoup object; parse with 'lxml' or 'html.parser'. This takes your browser and breaks it up into digestable pieces

soup = BeautifulSoup(html, 'html.parser')

In [25]:
# Make it pretty and easy to read

# print(soup.prettify())

In [26]:
# Examine the results, then determine the element that contains the sought info. .text helps just show the item you want.

news_title = soup.find('div', class_='content_title').text

news_p = soup.find('div', class_='article_teaser_body').text

In [27]:
#This is the text of the news title

news_title

"NASA's Perseverance Mars Rover Gets Balanced"

In [28]:
# This is the text of the news teaser

news_p

"The mission team performed a crucial weight-balancing test on the rover in preparation for this summer's history-making launch to the Red Planet."

In [29]:
# New url to scrape images

url_image = 'https://spaceimages-mars.com/'

In [30]:
# connect to the URL the same way you did above

browser.visit(url_image)
html_image = browser.html
soup_image = BeautifulSoup(html_image, 'html.parser')

In [31]:
# This is the item you want to scrape. We did not click in the browser to show just the image. Instead, we used the raw link

space_image = soup_image.find('img', class_='headerimage fade-in').get('src')

In [32]:
# this is the 2nd half of the URL that leads to only the image

space_image

'image/featured/mars1.jpg'

In [33]:
# final URL

total_image = url_image + space_image
total_image

'https://spaceimages-mars.com/image/featured/mars1.jpg'

In [34]:
# This is the url to the Mars Table
# We use pandas to read the table and take the 0 index position

mars_table=pandas.read_html('https://galaxyfacts-mars.com/')[0]

In [35]:
#This is the Mars that we pulled from the website

mars_table

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [36]:
# Convert the Mars table to html and save it as marstable.html

mars_table.to_html('marstable.html')

In [37]:
# Cerberus Title & Image

# Cerberus breakdown
mars_hemispheres = 'https://marshemispheres.com/'
browser.visit(mars_hemispheres)
cerberus_page = browser.links.find_by_partial_text('Cerberus').click()
cerberus_html = browser.html
soup_cerberus = BeautifulSoup(cerberus_html, 'html.parser')

# Cerberus Title
cerberus_title = soup_cerberus.find('h2', class_='title').text
cerberus_hemisphere = cerberus_title.split(' E')[0]
cerberus_hemisphere

# Cerberus Image

cerberus_image = soup_cerberus.find('img', class_='wide-image').get('src')
final_cerberus_image = mars_hemispheres + cerberus_image
final_cerberus_image

'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'

In [38]:
# Schiaparelli Title & Image

# Schiaparelli breakdown
mars_hemispheres = 'https://marshemispheres.com/'
browser.visit(mars_hemispheres)
schiaparelli_page = browser.links.find_by_partial_text('Schiaparelli').click()
schiaparelli_html = browser.html
soup_schiaparelli = BeautifulSoup(schiaparelli_html, 'html.parser')

# Schiaparelli Title
schiaparelli_title = soup_schiaparelli.find('h2', class_='title').text
schiaparelli_hemisphere = schiaparelli_title.split(' E')[0]
schiaparelli_hemisphere

# Schiaparelli Image

schiaparelli_image = soup_schiaparelli.find('img', class_='wide-image').get('src')
schiaparelli_image
final_schiaparelli_image = mars_hemispheres + schiaparelli_image
final_schiaparelli_image

'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'

In [39]:
# Syrtis Title & Image

# Syrtis breakdown
mars_hemispheres = 'https://marshemispheres.com/'
browser.visit(mars_hemispheres)
syrtis_page = browser.links.find_by_partial_text('Syrtis').click()
syrtis_html = browser.html
soup_syrtis = BeautifulSoup(syrtis_html, 'html.parser')

# Syrtis Title
syrtis_title = soup_syrtis.find('h2', class_='title').text
syrtis_hemisphere = syrtis_title.split(' E')[0]
syrtis_hemisphere

# Syrtis Image

syrtis_image = soup_syrtis.find('img', class_='wide-image').get('src')
final_syrtis_image = mars_hemispheres + syrtis_image
final_syrtis_image

'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'

In [40]:
# Valles Title & Image

# Valles breakdown
mars_hemispheres = 'https://marshemispheres.com/'
browser.visit(mars_hemispheres)
valles_page = browser.links.find_by_partial_text('Valles').click()
valles_html = browser.html
soup_valles = BeautifulSoup(valles_html, 'html.parser')

# Valles Title
valles_title = soup_valles.find('h2', class_='title').text
valles_hemisphere = valles_title.split(' E')[0]
valles_hemisphere

# Valles Image

valles_image = soup_valles.find('img', class_='wide-image').get('src')
final_valles_image = mars_hemispheres + valles_image
final_valles_image

'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'

In [41]:
# Create dictionaries

red_planet = {
    'News Title': news_title,
    'News Teaser' : news_p,
    'Space Image' : total_image,
    'Title Cerberus': cerberus_hemisphere, 
    'Cerberus URL': final_cerberus_image,
    'Title Schiaparelli': schiaparelli_hemisphere, 
    'Schiaparelli URL': final_schiaparelli_image,
    'Title Syrtis': syrtis_hemisphere, 
    'Syrtis URL': final_syrtis_image,
    'Title Valles': valles_hemisphere, 
    'Valles URL': final_valles_image
    }


In [42]:
red_planet

{'News Title': "NASA's Perseverance Mars Rover Gets Balanced",
 'News Teaser': "The mission team performed a crucial weight-balancing test on the rover in preparation for this summer's history-making launch to the Red Planet.",
 'Space Image': 'https://spaceimages-mars.com/image/featured/mars1.jpg',
 'Title Cerberus': 'Cerberus Hemisphere',
 'Cerberus URL': 'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg',
 'Title Schiaparelli': 'Schiaparelli Hemisphere',
 'Schiaparelli URL': 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg',
 'Title Syrtis': 'Syrtis Major Hemisphere',
 'Syrtis URL': 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg',
 'Title Valles': 'Valles Marineris Hemisphere',
 'Valles URL': 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}

In [43]:
collection.insert_one(red_planet)

<pymongo.results.InsertOneResult at 0x7fa88ca1f280>

In [44]:
app = Flask(__name__)

app.config["MONGO_URI"] = "mongodb://localhost:27017/mars_db"
mongo = PyMongo(app)

In [None]:
@app.route("/")
def index():
    marspage = mongo.db.red_planet.find_one()
    return render_template("index.html", red_planet=marspage)


@app.route("/scrape")
def scraper():
    red_planet = mongo.db.red_planet
    mars_data = scrape_mars.scrape()
    red_planet.update({}, mars_data, upsert=True)
    print(red_planet)
    return redirect("/", code=302)


if __name__ == "__main__":
    app.run(debug=True, use_reloader=False)


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [11/Dec/2021 18:49:21] "[37mGET / HTTP/1.1[0m" 200 -


Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/joescuteri/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/joescuteri/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
127.0.0.1 - - [11/Dec/2021 18:49:38] "[35m[1mGET /scrape HTTP/1.1[0m" 500 -
127.0.0.1 - - [11/Dec/2021 18:49:42] "[35m[1mGET /scrape HTTP/1.1[0m" 500 -
Traceback (most recent call last):
  File "/opt/anaconda3/envs/PythonData/lib/python3.8/site-packages/flask/app.py", line 2464, in __call__
    return self.wsgi_app(environ, start_response)
  File "/opt/anaconda3/envs/PythonData/lib/python3.8/site-packages/flask/app.py", line 2450, in wsgi_app
    resp

In [None]:
# #Initialize PyMongo to work with MongoDBs

# conn = 'mongodb://localhost:27017'
# client = pymongo.MongoClient(conn)

In [None]:
#Define database and collection

# db = client.mars_db
# collection = db.mars_articles