In [1]:
# import dependencies
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from splinter import Browser
# import below when using Chrome browser
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
import time
import json
import os
# Path to a locally installed chromedriver binary, with '~' expanded to the
# current user's home directory. Method 2 below passes it to Selenium's
# Service; Method 1 gets its driver path from ChromeDriverManager().install().
# NOTE(review): this location is machine-specific -- adjust it to wherever
# chromedriver lives on your system.
chromedrv = os.path.expanduser('~/ucbDA/chromedriver_win32/chromedriver')

## Method 1: splinter's executable_path

In [3]:
# Set some default options for chrome browser
options = webdriver.ChromeOptions()
options.add_argument("--lang=en")
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
# Set up splinter. The chromedriver path is still kept in `executable_path`,
# but it is handed to the browser wrapped in a Selenium 4 `Service` object:
# passing `executable_path=` straight through to the driver is deprecated and
# no longer accepted by newer Selenium 4 releases.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', service=Service(**executable_path), headless=False, options=options)
# Visit a site defined in the url
def geturl(url):
    """Open `url` in the splinter browser and give the news list time to load.

    Args:
        url: Address of the page to visit.

    Returns:
        None. The presence check below is best effort: its result is not
        acted upon, so the function returns normally whether or not a
        `div.list_text` element showed up within the wait time.
    """
    browser.visit(url)
    # Optional delay for loading the page (unit: seconds)
    browser.is_element_present_by_css('div.list_text', wait_time=1)

In [4]:
# Visit the Mars NASA news site in the splinter browser. geturl() (defined in
# the Method 1 setup cell) also waits up to one second for a `div.list_text`
# element to be present before returning.
geturl('https://redplanetscience.com')

In [5]:
# Take the page markup currently held by the splinter browser and build a
# BeautifulSoup tree from it with Python's built-in HTML parser
html = browser.html
news_soup = soup(markup=html, features='html.parser')
# Select every <div class="list_text"> element (the news article containers)
slide_elems = news_soup.select('div.list_text')

In [6]:
# Build one {'title', 'preview'} record per selected article container
news_list = [
    {
        # text of the nested <div class="content_title">
        'title': article.find('div', class_='content_title').text,
        # text of the nested <div class="article_teaser_body">
        'preview': article.find('div', class_='article_teaser_body').text,
    }
    for article in slide_elems
]

In [7]:
# Export the Python list of dicts into a JSON file
outfile = './Data/mars_data_method1.json'
# Make sure the target folder exists; open() would otherwise raise
# FileNotFoundError when there is no Data directory yet.
os.makedirs(os.path.dirname(outfile), exist_ok=True)
# The with-block closes the file on exit, so no explicit close() is needed.
# ensure_ascii=False writes non-ASCII characters as-is into the UTF-8 file.
with open(outfile, 'w', encoding='utf-8') as f:
    json.dump(news_list, f, ensure_ascii=False, indent=4)

In [8]:
# Verify the json file by loading it back. Reading inside a with-block makes
# sure the file handle is closed as soon as the data has been parsed.
with open(outfile, 'r', encoding='utf-8') as infile:
    mars_data = json.load(infile)

In [9]:
# Shut down the splinter browser opened for Method 1 now that its data is saved
browser.quit()

## Method 2: selenium's webdriver

In [10]:
# Set some default options for chrome browser
options = webdriver.ChromeOptions()
options.add_argument("--lang=en")
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")
# Set up the Selenium 4 Chrome driver using the local chromedriver binary
driver = webdriver.Chrome(service=Service(chromedrv), options=options)
# Visit a site defined in the url
def geturl(url):
    """Open `url` in the Selenium driver and give the news list time to load.

    Args:
        url: Address of the page to visit.

    Returns:
        None. The wait is best effort: if no `div.list_text` element is
        present within one second a warning is printed and the function
        returns normally, so the caller can still inspect the page.
    """
    driver.get(url)
    # Explicitly wait (up to 1 second) for the article containers, matching the
    # check used in Method 1. An implicit wait would not help here because it
    # only applies to element lookups, not to reading driver.page_source.
    try:
        WebDriverWait(driver, 1).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.list_text'))
        )
    except TimeoutException:
        print(f"Warning: no 'div.list_text' element found within 1 second on {url}")

In [11]:
# Visit the Mars NASA news site in the Selenium-driven Chrome session, using
# geturl() as redefined in the Method 2 setup cell (it replaces the splinter
# version of the same name from Method 1).
geturl('https://redplanetscience.com')

In [12]:
# Take the page source currently held by the Selenium driver and build a
# BeautifulSoup tree from it with Python's built-in HTML parser
html = driver.page_source
news_soup = soup(markup=html, features='html.parser')
# Select every <div class="list_text"> element (the news article containers)
slide_elems = news_soup.select('div.list_text')

In [13]:
# Build one {'title', 'preview'} record per selected article container
news_list = [
    {
        # text of the nested <div class="content_title">
        'title': article.find('div', class_='content_title').text,
        # text of the nested <div class="article_teaser_body">
        'preview': article.find('div', class_='article_teaser_body').text,
    }
    for article in slide_elems
]

In [14]:
# Export the Python list of dicts into a JSON file
outfile = './Data/mars_data.json'
# Make sure the target folder exists; open() would otherwise raise
# FileNotFoundError when there is no Data directory yet.
os.makedirs(os.path.dirname(outfile), exist_ok=True)
# The with-block closes the file on exit, so no explicit close() is needed.
# ensure_ascii=False writes non-ASCII characters as-is into the UTF-8 file.
with open(outfile, 'w', encoding='utf-8') as f:
    json.dump(news_list, f, ensure_ascii=False, indent=4)

In [15]:
# Verify the json file by loading it back. Reading inside a with-block makes
# sure the file handle is closed as soon as the data has been parsed.
with open(outfile, 'r', encoding='utf-8') as infile:
    mars_data = json.load(infile)
# Display the loaded records as the cell output
mars_data

[{'title': "NASA Wins 4 Webbys, 4 People's Voice Awards",
  'preview': 'Winners include the JPL-managed "Send Your Name to Mars" campaign, NASA\'s Global Climate Change website and Solar System Interactive.'},
 {'title': 'Heat and Dust Help Launch Martian Water Into Space, Scientists Find',
  'preview': 'Scientists using an instrument aboard NASA’s Mars Atmosphere and Volatile EvolutioN, or MAVEN, spacecraft have discovered that water vapor near the surface of the Red Planet is lofted higher into the atmosphere than anyone expected was possible. '},
 {'title': 'Mars 2020 Unwrapped and Ready for More Testing',
  'preview': "In time-lapse video, bunny-suited engineers remove the inner layer of protective foil on NASA's Mars 2020 rover after it was relocated for testing."},
 {'title': "NASA's Briefcase-Size MarCO Satellite Picks Up Honors",
  'preview': 'The twin spacecraft, the first of their kind to fly into deep space, earn a Laureate from Aviation Week & Space Technology.'},
 {'title'

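### Create a MongoDB Database
Import the exported JSON file into MongoDB by running the following command in a terminal, from the `Data` folder that contains `mars_data.json`:
```
mongoimport --type json -d mars_data -c news_list --drop --jsonArray mars_data.json
```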

In [16]:
from pymongo import MongoClient
# Create an instance of MongoClient for the server listening on port 27017
# (no host is given, so the client's default host is used)
mongo = MongoClient(port=27017)
# List the database names on the server; `mars_data` should be among them
# once the mongoimport command has been run
mongo.list_database_names()

['admin', 'config', 'local', 'mars_data', 'my_db1', 'petsitly_marketing']

In [17]:
# Get a handle on the imported `mars_data` database
mars_db = mongo.get_database('mars_data')
# Show its collections; `news_list` is the one used by the queries below
mars_db.list_collection_names()

['news_list', 'customer_list']

In [19]:
# Ask the news_list collection for its distinct `title` values (no filter)
# and print them one per line
titles = mars_db['news_list'].distinct('title')
for title in titles:
    print(title)

8 Martian Postcards to Celebrate Curiosity's Landing Anniversary
Heat and Dust Help Launch Martian Water Into Space, Scientists Find
How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus 
Join NASA for the Launch of the Mars 2020 Perseverance Rover
Mars 2020 Unwrapped and Ready for More Testing
NASA Moves Forward With Campaign to Return Mars Samples to Earth
NASA Readies Perseverance Mars Rover's Earthly Twin 
NASA Wins 4 Webbys, 4 People's Voice Awards
NASA to Broadcast Mars 2020 Perseverance Launch, Prelaunch Activities
NASA's Briefcase-Size MarCO Satellite Picks Up Honors
NASA's Perseverance Rover Attached to Atlas V Rocket
NASA's Perseverance Rover Will Carry First Spacesuit Materials to Mars
Naming a NASA Mars Rover Can Change Your Life
Sensors on Mars 2020 Spacecraft Answer Long-Distance Call From Earth
The Detective Aboard NASA's Perseverance Rover


In [20]:
# Ask the news_list collection for its distinct `preview` values (no filter)
# and print them one per line
previews = mars_db['news_list'].distinct('preview')
for preview in previews:
    print(preview)

An instrument called SHERLOC will, with the help of its partner WATSON, hunt for signs of ancient life by detecting organic molecules and minerals.
Did you know NASA's next Mars rover has a nearly identical sibling on Earth for testing? Even better, it's about to roll for the first time through a replica Martian landscape.
During this next phase, the program will mature critical technologies and make critical design decisions as well as assess industry partnerships.
In a Q&A, spacesuit designer Amy Ross explains how five samples, including a piece of helmet visor, will be tested aboard the rover, which is targeting a July 30 launch. 
In time-lapse video, bunny-suited engineers remove the inner layer of protective foil on NASA's Mars 2020 rover after it was relocated for testing.
Instruments tailored to collect data during the descent of NASA's next rover through the Red Planet's atmosphere have been checked in flight.
Like much of the rest of the world, the Mars rover team is pushing f

In [21]:
# Close the MongoDB client now that the queries above are finished
mongo.close()

In [22]:
# Quit the Selenium Chrome driver that was started for Method 2
driver.quit()