In [79]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

In [6]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [7]:
# Define database and collection
db = client.mars_db
collection = db.articles

# Nasa Mars News

In [42]:
# URL of page to be scraped
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [39]:
# Retrieve the parent divs for all articles
news_title = soup.find('div', class_='content_title').text.strip()
news_title

"NASA's Mars 2020 Rover Closer to Getting Its Name"

In [76]:
news_p = soup.find('div', class_='rollover_description_inner').text.strip()
news_p

"155 students from across the U.S. have been chosen as semifinalists in NASA's essay contest to name the Mars 2020 rover, and see it launch from Cape Canaveral this July."

# JPL Mars Space Images - Featured Image

In [87]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [97]:
# URL of page to be scraped
url_img = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

browser.visit(url_img)

In [112]:
try:
    browser.select('category', 'featured')
    bg_attribute_string =browser.find_by_css('article').first['style']
    bg_attribute_string_left = bg_attribute_string.split('("',1)[1]
    bg_attribute_string_final = bg_attribute_string_left.split('")',1)[0]
    
except ElementDoesNotExist:
    print("Clicking Complete")


bg_attribute_string_final

'/spaceimages/images/wallpaper/PIA16711-1920x1200.jpg'

In [113]:
featured_image_url = 'https://www.jpl.nasa.gov' + bg_attribute_string_final
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA16711-1920x1200.jpg'

# Mars Weather

In [99]:
# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup_img = BeautifulSoup(response.text, 'lxml')

In [None]:
soup_img.find('article','carousel_item')

In [85]:
try:
    browser.select('category', 'featured')
    
except ElementDoesNotExist:
    print("Clicking Complete")

WebDriverException: Message: chrome not reachable
  (Session info: chrome=79.0.3945.130)


In [5]:
# Retrieve the parent divs for all articles
results = soup.find_all('div', class_='slide')

# Loop through results to retrieve article title, header, and timestamp of article
for result in results:
    title = result.find('div', class_='content-title').text

    lede = result.find('h5', class_='mixed-feed__subheader').text

    # The time and date of article publication
    date = result.find('time')['datetime']
    # Slice the datetime string for the date
    article_date = date[:10]
    # Slice the datetime string for the time
    time = date[11:16]
    # Determine whether article was published in AM or PM
    if (int(time[:2]) >= 13):
        meridiem = 'pm'
    else:
        meridiem = 'am'

    # Concatenate time string
    time = time + meridiem
    print('-----------------')
    print(title)
    print(lede)
    print(article_date)
    print(time)

    # Dictionary to be inserted into MongoDB
    post = {
        'title': title,
        'lede': lede,
        'date': article_date,
        'time published': time
    }

    # Insert dictionary into MongoDB as a document
    collection.insert_one(post)

-----------------
Lady Byng Trophy finalists unveiled
Barkov, O'Reilly, William Karlsson up for award given for combination of skill, gentlemanly play
2018-04-20
19:00pm
-----------------
Peters resigns as Hurricanes coach
Went 137-138-53 in four seasons with Carolina
2018-04-20
12:00am
-----------------
Bergeron skates, could return for Bruins in Game 5 against Maple Leafs
Boston one win from advancing to second round
2018-04-20
14:33pm
-----------------
Friday Four: Golden Knights continue fairy-tale season
NHL Network analyst Weekes also discusses Sharks, Capitals-Blue Jackets series, Subban
2018-04-20
14:01pm
-----------------
Fantasy: Daily primer for 2018 Stanley Cup Playoffs
Kadri back from suspension in Game 5; Bergeron could return; Wennberg skating
2018-04-20
19:05pm
-----------------
Sharks expect to stick with same lineup in second round
Coach says he'd be 'idiot' to make changes from group that swept Ducks
2018-04-20
17:20pm
-----------------
Doughty open to signing extens

In [6]:
# Display the MongoDB records created above
articles = db.articles.find()
for article in articles:
    print(article)

{'_id': ObjectId('5ada7a1bee61f93d3f7e3d07'), 'title': 'Lady Byng Trophy finalists unveiled', 'lede': "Barkov, O'Reilly, William Karlsson up for award given for combination of skill, gentlemanly play", 'date': '2018-04-20', 'time published': '19:00pm'}
{'_id': ObjectId('5ada7a1bee61f93d3f7e3d08'), 'title': 'Peters resigns as Hurricanes coach', 'lede': 'Went 137-138-53 in four seasons with Carolina', 'date': '2018-04-20', 'time published': '12:00am'}
{'_id': ObjectId('5ada7a1bee61f93d3f7e3d09'), 'title': 'Bergeron skates, could return for Bruins in Game 5 against Maple Leafs', 'lede': 'Boston one win from advancing to second round', 'date': '2018-04-20', 'time published': '14:33pm'}
{'_id': ObjectId('5ada7a1bee61f93d3f7e3d0a'), 'title': 'Friday Four: Golden Knights continue fairy-tale season', 'lede': 'NHL Network analyst Weekes also discusses Sharks, Capitals-Blue Jackets series, Subban', 'date': '2018-04-20', 'time published': '14:01pm'}
{'_id': ObjectId('5ada7a1bee61f93d3f7e3d0b'), '