## Import Dependencies

In [1]:
import pandas as pd
import requests
import pymongo
import os
import tweepy
import json
import numpy as np

from config import consumer_key, consumer_secret, access_token, access_token_secret
from datetime import datetime
from splinter import Browser
from bs4 import BeautifulSoup as bs
from pprint import pprint

# Setup Tweepy Authentication
# The four credentials are imported from the local `config` module.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# JSONParser is requested so results can presumably be indexed like dicts
# (the Mars Weather section reads tweet["text"]).
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

## Step 1 - Scraping

In [None]:
# Open a visible (headless=False) Chrome window driven by Splinter.
# NOTE(review): 'chromedriver.exe' is a relative path, so the driver is
# presumably expected next to this notebook or on PATH — confirm before
# running this on another machine.
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [None]:
# Connect to the MongoDB server listening on localhost:27017.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [None]:
# Handles to the "space_db" database and the "articles" collection inside it.
db = client["space_db"]
collection = db["articles"]

## NASA Mars News

In [None]:
# NASA Mars news listing; the query string asks for page 0, 40 items per
# page, ordered by publish date descending. Open it in the browser.
url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
browser.visit(url)

In [None]:
# Retrieve page with the requests module (a plain HTTP GET, independent of
# the Splinter browser session).
response = requests.get(url)

# Create BeautifulSoup object; parse with 'lxml'
# NOTE: `soup` is rebound later from `browser.html`; `results` is taken from
# this version first.
soup = bs(response.text, 'lxml')

In [None]:
# Every <div class="list_text"> in the parsed page; each one is treated as a
# single news item by the loops further down.
results = soup.find_all("div", class_="list_text")

In [None]:
# Parse the page as currently rendered in the Splinter-driven browser rather
# than the raw `requests` response fetched earlier.
html = browser.html
soup = bs(html, 'html.parser')

# Each news item is a <div class="list_text">.
posts = soup.find_all("div", class_="list_text")

# Parallel lists: title[i] and para[i] describe the same post.
title = [post.a.text for post in posts]
para = [
    post.find("div", class_="article_teaser_body").text
    for post in posts
]

# `article` describes the first post on the page. It is built once, after the
# scrape, instead of being rebuilt on every loop iteration, and it is always
# defined: when no posts were found both fields are None rather than leaving
# `article` unbound (or stale from an earlier run of the notebook).
article = {
    "news_title": title[0] if title else None,
    "news_text": para[0] if para else None,
}

In [None]:
# Show the headline and then the teaser text stored in `article`.
for field in ("news_title", "news_text"):
    print(article[field])

In [None]:
# Echo every post parsed from the `requests` response.
# NOTE: this rebinds `title`, `para` and `article`; once the loop finishes
# they describe the last post in `results`.
for result in results:
    title = result.a.text
    para = result.find("div", class_="article_teaser_body").text

    article = dict(news_title=title, news_paragraph=para)

    # Separator, headline and teaser, each on its own line.
    print("---------------------------------", title, para, sep="\n")

## Finding the Featured Image URL

In [None]:
# JPL space-images search page filtered to the Mars category; open it in the
# browser so the link clicks below act on this page.
url2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url2)

In [None]:
# Retrieve page with the requests module
response_image = requests.get(url2)

# Create BeautifulSoup object; parse with 'lxml'
# NOTE(review): `soup_image` is not referenced again in the visible part of
# the notebook — confirm whether this fetch is still needed.
soup_image = bs(response_image.text, 'lxml')

In [None]:
# Follow the link whose text contains 'FULL IMAGE' on the page the browser
# currently has open.
browser.click_link_by_partial_text('FULL IMAGE')

In [None]:
# Then follow the link whose text contains 'more info'.
browser.click_link_by_partial_text('more info')

In [None]:
# URL varies depending on what the web browser opens, but the value below is
# a hard-coded detail page rather than one read back from the browser.
# NOTE(review): presumably this should track the page reached through the
# 'FULL IMAGE' / 'more info' clicks — confirm before relying on it.
url_image = "https://www.jpl.nasa.gov/spaceimages/details.php?id=PIA17924"

In [None]:
# Retrieve page with the requests module (the image detail page)
response_image1 = requests.get(url_image)

# Create BeautifulSoup object; parse with 'lxml'
soup_image1 = bs(response_image1.text, 'lxml')

In [None]:
# All <figure class="lede"> elements on the detail page; the image link is
# read from these in the next step.
results_image = soup_image1.find_all("figure", class_="lede")

In [None]:
# Scraping the image url: take the href of the first link inside each figure.
image_url = [figure.a["href"] for figure in results_image]

In [None]:
# Bare expression: displays the collected hrefs as the notebook cell output.
image_url

In [None]:
# Prefix each scraped href with the JPL site root to get an absolute URL.
# The loop variable is deliberately not called `url`: that name already holds
# the NASA news listing address used by earlier cells, and reusing it here
# would silently overwrite it.
# After the loop `featured_image_url` holds the URL built from the last href.
for image_path in image_url:
    featured_image_url = "https://www.jpl.nasa.gov" + image_path
    print(featured_image_url)

## Mars Weather

In [2]:
# Twitter handle whose timeline is read for the weather report.
target_user = "@MarsWxReport"

In [3]:
# Fetch the account's timeline through the authenticated `api` client.
# NOTE: the recorded run of this cell failed with Twitter error code 32
# ('Could not authenticate you.'), so the credentials in `config` need
# checking before the cells below can work.
public_tweets = api.user_timeline(target_user)

TweepError: [{'code': 32, 'message': 'Could not authenticate you.'}]

In [None]:
# Text of every tweet in the fetched timeline, in the order returned.
weather_tweet = [tweet["text"] for tweet in public_tweets]

In [None]:
# Use the first tweet in the list as the weather report and show it.
# NOTE(review): raises IndexError if the timeline came back empty.
mars_weather = weather_tweet[0]
print(mars_weather)

## Mars Facts

In [None]:
# Page whose HTML tables are read for the Mars facts.
url_facts = "https://space-facts.com/mars/"

In [None]:
# pd.read_html returns a list of DataFrames, one per table it parses from the
# page; the bare `tables` expression displays them in the notebook.
tables = pd.read_html(url_facts)
tables

In [None]:
# Work with the first table found on the page.
df = tables[0]

In [None]:
# Promote column 0 to the index and drop the index name so no stray header
# label is shown above it.
df_clean = df.set_index(0).rename_axis(None)

In [None]:
# Give the remaining column (labelled 1) a readable header, then display the
# frame as the cell output.
df_clean = df_clean.rename(columns={1: "Values"})
df_clean

In [None]:
# Render the cleaned table as an HTML string.
html_table = df_clean.to_html()

In [None]:
# str.replace returns a new string, so the result must be assigned back;
# otherwise `html_table` keeps its newlines. The trailing bare expression
# keeps the notebook cell displaying the cleaned markup.
html_table = html_table.replace('\n', '')
html_table

In [None]:
# Write the same table to 'mars_facts.html' (a relative path, so it lands in
# the current working directory).
df_clean.to_html('mars_facts.html')

## Mars Hemisphere

In [None]:
# USGS Astrogeology pages for the four enhanced Viking hemisphere maps:
# Cerberus, Schiaparelli, Syrtis Major and Valles Marineris.
url_cerberus = "https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced"
url_schiaparelli = "https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced"
url_syrtis = "https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced"
url_valles = "https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced"

In [None]:
url_hemisphere = [url_cerberus, url_schiaparelli, url_syrtis, url_valles]

# Fetch each hemisphere page and print its title and wide-image URL.
# The loop variable is deliberately not called `url`: that name already holds
# the NASA news listing address used by earlier cells, and reusing it here
# would silently overwrite it.
for hemisphere_url in url_hemisphere:
    response_hemisphere = requests.get(hemisphere_url)
    soup_hemisphere = bs(response_hemisphere.text, 'lxml')

    results_hemisphere = soup_hemisphere.find_all("div", class_="container")

    for result in results_hemisphere:
        heading = result.h2
        wide_image = result.find("img", class_="wide-image")

        # A page can contain several "container" divs; skip any that lack the
        # heading or the wide image instead of crashing on a None lookup.
        if heading is None or wide_image is None:
            continue

        title = heading.text
        img_url = wide_image["src"]

        # Rebound for every match, so after the loops this is the dict for
        # the last hemisphere processed (the following cell indexes it as a
        # dict, so the shape is kept as-is).
        hemisphere_image_urls = {
            "title": title,
            "img_url": "https://astrogeology.usgs.gov" + img_url,
        }

        print(hemisphere_image_urls)

In [None]:
# `hemisphere_image_urls` is the dict left over from the last loop iteration,
# so this prints the image URL of the last hemisphere processed.
print(hemisphere_image_urls["img_url"])