In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests
from selenium import webdriver
from splinter import Browser
import time

import json
import tweepy
import apikeys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# BeautifulSoup alone cannot see the parts of the page that are rendered by
# JavaScript, so Selenium drives a real Chrome instance to obtain the fully
# rendered HTML.
url = 'https://mars.nasa.gov/news/'
browser = webdriver.Chrome('chromedriver.exe')

try:
    # Load the page with the driver's `get` method, then read the rendered
    # markup from its `page_source` attribute.
    browser.get(url)
    html = browser.page_source
finally:
    # `quit()` ends the whole WebDriver session, whereas `close()` only closes
    # the current window.  Running it in `finally` guarantees the browser is
    # released even when loading the page raises.
    browser.quit()

# Create BeautifulSoup object; parse with 'html.parser'
soup = bs(html, 'html.parser')

### The Selenium module was used to pull the HTML from the NASA website because the site loads some of its articles with JavaScript.  As a result, BeautifulSoup on its own could only read about half of the 'li' tags on the page.  Even after 'prettify' was called, the HTML text only showed 6-7 content titles, where there should have been 15.

In [2]:
# print(soup.body.prettify())

In [3]:
# THIS LINE OF CODE FINDS THE BOTTOM 'SLIDE AND SLIDE' ARTICLES WHICH
# ARE DUPLICATES FROM THE TOP PORTION

# soup_title = soup.find_all("div", class_="content_title")
# for result in soup_title:
#     print(result.text)

In [4]:
# THIS LINE OF CODE FINDS THE BOTTOM 'SLIDE AND SLIDE' ARTICLES WHICH
# ARE DUPLICATES FROM THE TOP PORTION

# soup_teaser = soup.find_all("div", class_="rollover_description_inner")
# for each in soup_teaser:
#     print(each.text)

In [5]:
# Collect every <li> element carrying the CSS class 'slide'; the next cell
# reads a title and a teaser out of each of them.
soup_li = soup.find_all('li', attrs={'class': 'slide'})
#len(soup_li)

In [6]:
# Parallel lists: entry i of each list comes from the same slide.
list_of_titles = []
list_of_paragraphs = []

for eachslide in soup_li:
    title_div = eachslide.find('div', class_='content_title')
    teaser_div = eachslide.find('div', class_='article_teaser_body')
    # `find` returns None when a slide has no matching div.  Skip such slides
    # so `.text` is never read from None and both lists stay the same length.
    if title_div is None or teaser_div is None:
        continue
    one_title = title_div.text
    one_paragraph = teaser_div.text
    list_of_titles.append(one_title)
    list_of_paragraphs.append(one_paragraph)

In [7]:
# The first slide's title and teaser are used as the news item.
news_title, news_p = list_of_titles[0], list_of_paragraphs[0]

# One value per line: title first, then the teaser paragraph.
print(news_title, news_p, sep='\n')

Mount Sharp 'Photobombs' Mars Curiosity Rover
A new self-portrait of NASA's Curiosity Mars rover shows the vehicle on Vera Rubin Ridge.


### Using splinter to click through the website to find the 'featured' image.  This involves clicking two links; a timer is used between them because the page loads more slowly than the program executes, so without it an error would pop up whenever the new page had not finished opening before the next line ran.  The timer is set to a 2-second delay before the next line of code.

In [8]:

# Open a visible (non-headless) Chrome window through splinter.
splint_browser = Browser(
    'chrome',
    executable_path='chromedriver.exe',
    headless=False,
)

# JPL space-images gallery, filtered to the Mars category.
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
splint_browser.visit(url)

# Two clicks lead to the image detail page: 'FULL IMAGE', then 'more info'.
splint_browser.click_link_by_partial_text('FULL IMAGE')
# Pause so the page opened by the first click has time to load before the
# second link is looked up.
time.sleep(2)
splint_browser.click_link_by_partial_text('more info')

In [9]:
# Capture the markup of the page the browser ended up on, release the
# browser, and only then parse the captured string.
html = splint_browser.html
splint_browser.quit()
soup = bs(html, 'html.parser')
#soup.body.prettify()

In [10]:
# Every <figure class="lede"> on the page; the next cell reads the link
# (`a['href']`) out of each one.
image_src = soup.find_all('figure', attrs={'class': 'lede'})
len(image_src)

1

In [11]:
# Start from an explicit "not found" value so later cells never hit a
# NameError when no <figure class="lede"> was scraped.
featured_image_url = None

for each in image_src:
    # A figure without an anchor (or an anchor without href) has no image
    # link to offer; skip it instead of failing on `None['href']`.
    if each.a is None or not each.a.get('href'):
        continue
    print(each.a['href'])
    # The href is site-relative, so prefix it with the JPL host.  If several
    # figures match, the last one wins (same as before).
    featured_image_url = 'https://www.jpl.nasa.gov'+each.a['href']

/spaceimages/images/largesize/PIA19404_hires.jpg


In [12]:
# Show the absolute image URL assembled from the scraped href.
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA19404_hires.jpg


### Pulling recent tweets from the Mars weather Twitter account (handle 'marswxreport').  The text of the weather tweet is stored as information on Mars' weather patterns.

In [13]:
# Twitter credentials are read from the local `apikeys` module.
consumer_key, consumer_secret = (
    apikeys.TWITTER_CONSUMER_KEY,
    apikeys.TWITTER_CONSUMER_SECRET,
)
access_token, access_token_secret = (
    apikeys.TWITTER_ACCESS_TOKEN,
    apikeys.TWITTER_ACCESS_TOKEN_SECRET,
)

# Authenticate with Tweepy.  The JSON parser is requested so that the cells
# below can index tweets like dictionaries (e.g. tweet['text']).
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())


In [14]:
# Fetch the three most recent tweets of the 'marswxreport' account.  The
# account is passed by keyword (`screen_name=`): newer Tweepy releases no
# longer accept it positionally, while older ones accept the keyword as well.
public_tweets = api.user_timeline(screen_name='marswxreport', count=3)
#print(json.dumps(public_tweets, sort_keys=True, indent=4))
# Fallback value kept when none of the fetched tweets is a weather report.
mars_weather = ''

In [15]:
# Keep the first tweet (in the order returned) whose text mentions both "hPa"
# and "Sol"; if none does, mars_weather keeps the value it already had.
mars_weather = next(
    (
        tweet['text']
        for tweet in public_tweets
        if "hPa" in tweet['text'] and "Sol" in tweet['text']
    ),
    mars_weather,
)

print(mars_weather)

Sol 1949 (Jan 29, 2018), Sunny, high -20C/-4F, low -78C/-108F, pressure at 7.51 hPa, daylight 05:42-17:28


### Building the table - the table is pulled from https://space-facts.com/mars/ and converted to a DataFrame, where the columns are renamed; the DataFrame is then converted back to an HTML string stored in a variable.

In [31]:
url_tables = 'https://space-facts.com/mars/'

# `read_html` returns a list with one DataFrame per <table> found on the page.
tables = pd.read_html(url_tables)

for each in tables:
    print(each)

# Report the number of tables once, after the loop, instead of repeating the
# same line for every table.
print("Tablelength: " + str(len(tables)))

                      0                              1
0  Equatorial Diameter:                       6,792 km
1       Polar Diameter:                       6,752 km
2                 Mass:  6.42 x 10^23 kg (10.7% Earth)
3                Moons:            2 (Phobos & Deimos)
4       Orbit Distance:       227,943,824 km (1.52 AU)
5         Orbit Period:           687 days (1.9 years)
6  Surface Temperature:                  -153 to 20 °C
7         First Record:              2nd millennium BC
8          Recorded By:           Egyptian astronomers
Tablelength: 1


In [33]:
# Take the first scraped table, give its two positional columns readable
# names, and use the fact name as the index.
table_df = (
    tables[0]
    .rename(columns={0: "planet_profile", 1: "mars_data"})
    .set_index('planet_profile')
)
table_df.head()

Unnamed: 0_level_0,mars_data
planet_profile,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"


In [34]:
# Render the facts table as an HTML <table> string.
table_html = table_df.to_html()
table_html

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>mars_data</th>\n    </tr>\n    <tr>\n      <th>planet_profile</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n  

### Finding hemisphere pictures (4)
* Splinter is used to click through the list of 4 links on the opening page.  All of the H3 tags (titles) are gathered and stored in a list.
* Splinter is then used again to click through to the URL where the image is stored.  This involves two clicks to reach the large image file, which opens in a popup page that has to be set as the 'current' page.
* The image URLs are stored in a separate list, which is zipped with the titles into tuples that are then used to create the list of dictionaries.

In [19]:
# Empty frame with one column for the hemisphere title and one for its
# image URL.
hemispheres = pd.DataFrame(
    columns=[
        'title',
        'img_url',
    ],
)
hemispheres

Unnamed: 0,title,img_url


In [20]:
splint_browser = Browser('chrome', executable_path='chromedriver.exe',
                  headless=False)

# USGS search-results page for the enhanced Mars hemisphere products.  The
# image-scraping cell further down visits this same `url` again.
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

# Titles of the hemisphere entries, in page order.
hemisphere_list = []

try:
    splint_browser.visit(url)
    html = splint_browser.html
finally:
    # Always release the browser, even if loading the page raised.
    splint_browser.quit()

soup = bs(html, 'html.parser')
findHemisphere = soup.find_all('div', class_='item')

for each in findHemisphere:
    # An item without an <h3> has no title to record; skip it rather than
    # failing on `None.text`.
    if each.h3 is None:
        continue
    print(each.h3.text)
    hemisphere_list.append(each.h3.text)

Cerberus Hemisphere Enhanced
Schiaparelli Hemisphere Enhanced
Syrtis Major Hemisphere Enhanced
Valles Marineris Hemisphere Enhanced


In [24]:
# Image URLs, appended in the same order as the titles in `hemisphere_list`.
hemisphere_image = []

# NOTE: `url` is the search-results URL assigned in the cell that builds
# `hemisphere_list`; that cell must have been run first.
for eachHemi in hemisphere_list:

    # A new browser is started for every hemisphere and always shut down in
    # the `finally` below, so a failed click cannot leave a Chrome window
    # behind.
    splint_browser = Browser('chrome', executable_path='chromedriver.exe',
                  headless=False)
    try:
        splint_browser.visit(url)
        time.sleep(2)
        # Open the detail page of this hemisphere via its title link.
        splint_browser.click_link_by_partial_text(eachHemi)

        time.sleep(2)
        splint_browser.click_link_by_text('Sample')
        # The 'Sample' link opens the image in a second window; make that
        # window current so `.html` returns its markup.
        splint_browser.windows.current = splint_browser.windows[1]
        html = splint_browser.html
    finally:
        splint_browser.quit()

    soup = bs(html, 'html.parser')
    # The `src` of the first <img> in the popup's body is taken as the
    # image URL.
    hemi_image = soup.body.find('img')['src']

    hemisphere_image.append(hemi_image)

In [25]:
# Print every image URL followed by one empty line.
for each in hemisphere_image:
    print(each, end='\n\n')

https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg

https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg

https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg

https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg



In [26]:
# print(splint_browser.windows.current)
# print(splint_browser.url)
# splint_browser.windows.current = splint_browser.windows[1]

In [27]:
# html = splint_browser.html
# soup = bs(html, 'html.parser')
# print(soup.body.prettify())
# splint_browser.quit()

In [28]:
# hemi_image = soup.body.find('img')['src']
# print(hemi_image)
# print(type(hemi_image))

In [29]:
# hemi_image = soup.body

# for each in hemi_image:
#     print(each['src'])
#     print(type(each['src']))

In [61]:
# Will receive one {'title': ..., 'img_url': ...} dict per hemisphere.
title_image_url = []
# Pair each title with the image URL at the same position.  In Python 3 `zip`
# returns a single-pass iterator, so this cell has to be re-run before the
# pairs can be iterated a second time.
title_image_tuple = zip(hemisphere_list, hemisphere_image)

In [62]:
# Turn every (title, image URL) pair into a dict and append it to the list.
title_image_url.extend(
    {'title': hemi_title, 'img_url': hemi_url}
    for hemi_title, hemi_url in title_image_tuple
)

In [63]:
# One dictionary per line.
for hemisphere_entry in title_image_url:
    print(hemisphere_entry)

{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}
{'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}
{'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'}
{'title': 'Valles Marineris Hemisphere Enhanced', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}


In [64]:
# Bundle every scraped piece of Mars data into a single dictionary.
mars_dict = {
    'News Title': news_title,
    'News Paragraph': news_p,
    'Featured Image': featured_image_url,
    'Mars Weather': mars_weather,
    'Mars Info': table_html,
    'Hemisphere Images': title_image_url,
}

In [65]:
# Dump the assembled dictionary as plain text for a final visual check.
print(mars_dict)

{'News Title': "Mount Sharp 'Photobombs' Mars Curiosity Rover", 'News Paragraph': "A new self-portrait of NASA's Curiosity Mars rover shows the vehicle on Vera Rubin Ridge.", 'Featured Image': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA14712_hires.jpg', 'Mars Weather': 'Sol 1949 (Jan 29, 2018), Sunny, high -20C/-4F, low -78C/-108F, pressure at 7.51 hPa, daylight 05:42-17:28', 'Mars Info': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>planet_profile</th>\n      <th>mars_data</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n   