# Deliverable 1: web scraping the Mars News
- Scrape **titles** and **preview texts** from Mars news articles.
- Optionally export the data into a JSON file or a MongoDB database. 

In [1]:
# import dependencies
import pandas as pd
import numpy as np
from splinter import Browser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
# import below when using Chrome browser
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
import time
import json
import os
# Path to the local chromedriver binary.
# A raw string keeps the Windows backslash literal: '\c' is not a valid escape
# sequence and makes Python emit a DeprecationWarning/SyntaxWarning.
chromedrv = os.path.expanduser(r'C:\chromedriver.exe')

## Method 1: splinter's executable_path

In [3]:
# Default Chrome command-line flags used for this scraping session
options = webdriver.ChromeOptions()
for chrome_flag in ("--lang=en", "--start-maximized", "--disable-notifications"):
    options.add_argument(chrome_flag)
# Start a non-headless Chrome through splinter, handing it the driver path
# returned by ChromeDriverManager().install().
# NOTE(review): executable_path is deprecated — confirm that the installed
# splinter/selenium versions still accept it.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=False, options=options, **executable_path)
# Visit a site defined in the url
def geturl(url):
    """Open *url* in the splinter browser and briefly wait for the article list.

    Args:
        url: Address of the page to load.

    Returns:
        None. The result of the presence check is discarded.
    """
    browser.visit(url)
    article_selector = 'div.list_text'
    # Give the page up to 1 second for an article container to appear
    browser.is_element_present_by_css(article_selector, wait_time=1)

In [4]:
# Load the Mars news page (redplanetscience.com) via geturl()
geturl('https://redplanetscience.com')

In [5]:
# Take the browser's current HTML, parse it with the built-in 'html.parser',
# and collect every <div class="list_text"> element (one per news article)
html = browser.html
news_soup = soup(html, features='html.parser')
slide_elems = news_soup.select('div.list_text')

In [6]:
# Build one {'title': ..., 'preview': ...} record per article container.
# The title comes from the 'content_title' div and the preview text from the
# 'article_teaser_body' div found inside each container.
news_list = [
    {
        'title': article.find('div', class_='content_title').text,
        'preview': article.find('div', class_='article_teaser_body').text,
    }
    for article in slide_elems
]

In [7]:
# Export the scraped records (a list of dicts) into a JSON file
outfile = './Data/mars_data_method1.json'
# Create the target folder first so open() does not fail when it is missing
os.makedirs(os.path.dirname(outfile), exist_ok=True)
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
# The with-block closes the file on exit, so no explicit close() is needed.
with open(outfile, 'w', encoding='utf-8') as f:
    json.dump(news_list, f, ensure_ascii=False, indent=4)

In [8]:
# Verify the JSON file by loading it back.
# The with-block closes the file handle once the data has been read.
with open(outfile, 'r', encoding='utf-8') as infile:
    mars_data = json.load(infile)

In [9]:
# Quit the splinter browser created for Method 1
browser.quit()

## Method 2: selenium's webdriver

In [10]:
# Default Chrome command-line flags used for this scraping session
options = webdriver.ChromeOptions()
for chrome_flag in ("--lang=en", "--start-maximized", "--disable-notifications"):
    options.add_argument(chrome_flag)
# Start Chrome directly through selenium (selenium 4 Service API), using the
# local chromedriver path stored in chromedrv
driver = webdriver.Chrome(options=options, service=Service(chromedrv))
# Visit a site defined in the url
def geturl(url):
    """Open *url* in the selenium driver and wait for the article list.

    implicitly_wait() only configures how long later element look-ups may
    poll; it does not pause by itself, so the page source could be read
    before the articles are rendered. An explicit wait on 'div.list_text'
    is used instead, the same element Method 1 waits for.

    Args:
        url: Address of the page to load.

    Returns:
        None. A missing article list is tolerated (best effort).
    """
    # Local imports keep this cell self-contained
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    driver.get(url)
    try:
        # Poll for up to 1 second until an article container is in the DOM
        WebDriverWait(driver, 1).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.list_text'))
        )
    except TimeoutException:
        # Best effort: carry on even if no article container showed up in time
        pass

In [11]:
# Load the Mars news page (redplanetscience.com) via geturl()
geturl('https://redplanetscience.com')

In [12]:
# Take the driver's current page source, parse it with the built-in
# 'html.parser', and collect every <div class="list_text"> element
html = driver.page_source
news_soup = soup(html, features='html.parser')
slide_elems = news_soup.select('div.list_text')

In [13]:
# Build one {'title': ..., 'preview': ...} record per article container.
# The title comes from the 'content_title' div and the preview text from the
# 'article_teaser_body' div found inside each container.
news_list = [
    {
        'title': article.find('div', class_='content_title').text,
        'preview': article.find('div', class_='article_teaser_body').text,
    }
    for article in slide_elems
]

In [14]:
# Export the scraped records (a list of dicts) into a JSON file
outfile = './Data/mars_data.json'
# Create the target folder first so open() does not fail when it is missing
os.makedirs(os.path.dirname(outfile), exist_ok=True)
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
# The with-block closes the file on exit, so no explicit close() is needed.
with open(outfile, 'w', encoding='utf-8') as f:
    json.dump(news_list, f, ensure_ascii=False, indent=4)

In [15]:
# Verify the JSON file by loading it back.
# The with-block closes the file handle once the data has been read.
with open(outfile, 'r', encoding='utf-8') as infile:
    mars_data = json.load(infile)
# Last expression of the cell: displays the loaded records
mars_data

[{'title': "Virginia Middle School Student Earns Honor of Naming NASA's Next Mars Rover",
  'preview': 'NASA chose a seventh-grader from Virginia as winner of the agency\'s "Name the Rover" essay contest. Alexander Mather\'s entry for "Perseverance" was voted tops among 28,000 entries. '},
 {'title': 'Naming a NASA Mars Rover Can Change Your Life',
  'preview': 'Want to name the robotic scientist NASA is sending to Mars in 2020? The student who named Curiosity — the rover currently exploring Mars — will tell you this is an opportunity worth taking.'},
 {'title': "3 Things We've Learned From NASA's Mars InSight ",
  'preview': 'Scientists are finding new mysteries since the geophysics mission landed two years ago.'},
 {'title': "NASA Wins 4 Webbys, 4 People's Voice Awards",
  'preview': 'Winners include the JPL-managed "Send Your Name to Mars" campaign, NASA\'s Global Climate Change website and Solar System Interactive.'},
 {'title': "NASA's Briefcase-Size MarCO Satellite Picks Up Honor

### Create a MongoDB Database
```
mongoimport --type json -d mars_data -c news_list --drop --jsonArray mars_data.json
```

In [16]:
from pymongo import MongoClient
# Create an instance of MongoClient on port 27017 (no host is passed)
mongo = MongoClient(port=27017)
# Display the names of the databases visible through this client
mongo.list_database_names()

['admin', 'config', 'local', 'team_db']

In [17]:
# Select the 'mars_data' database (the -d target of the mongoimport command above)
mars_db = mongo['mars_data']
# Display the names of the collections in that database
mars_db.list_collection_names()

[]

In [18]:
# Fetch the distinct 'title' values stored in the 'news_list' collection
# and print them one per line
news_collection = mars_db['news_list']
titles = news_collection.find().distinct('title')
for title in titles:
    print(title)

In [19]:
# Fetch the distinct 'preview' values stored in the 'news_list' collection
# and print them one per line
news_collection = mars_db['news_list']
previews = news_collection.find().distinct('preview')
for preview in previews:
    print(preview)

In [20]:
# Close the MongoClient created earlier
mongo.close()

In [21]:
# Quit the selenium Chrome driver created for Method 2
driver.quit()