# Web Scraping for the Mars Mission

In [1]:
import requests
from bs4 import BeautifulSoup
from splinter import Browser
import pandas as pd
import time
import datetime
from pprint import pprint

In [2]:
!which chromedriver

/usr/local/bin/chromedriver


In [3]:
# Tell splinter where the chromedriver binary lives, then open a visible
# (non-headless) Chrome window that every scraping cell below drives.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

In [4]:
# Shared results container: each part of the notebook adds its scraped
# values to this dictionary under its own key.
mars_info = {}

# Part 1. Mars News

In [5]:
# Address of the NASA Mars news listing scraped in Part 1.
url = "https://mars.nasa.gov/news/"

In [6]:
# Navigate the splinter-controlled browser to the news listing.
browser.visit(url)

In [7]:
# Pause 3 seconds before the page source is read in the next cell;
# presumably this gives the page time to render (TODO confirm a fixed delay is enough).
time.sleep(3)

In [8]:
# Snapshot the HTML of the page currently loaded in the browser.
html = browser.html

In [9]:
# Parse the captured news page with Python's built-in 'html.parser' backend.
news_soup = BeautifulSoup(html, 'html.parser')

In [10]:
# Ask the live browser (wait_time=1) whether a news slide matching the selector exists.
# NOTE(review): `html` was already captured and parsed in earlier cells, so this
# check only reports on the live page; it does not gate the parse above.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [11]:
# Take the first `li.slide` inside `ul.item_list` from the parsed news page.
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [12]:
# Dump the slide's raw markup to inspect which child tags hold the title and teaser.
print(slide_elem)

<li class="slide"><div class="image_and_description_container"><a href="/news/8438/nasas-mro-completes-60000-trips-around-mars/" target="_self"><div class="rollover_description"><div class="rollover_description_inner">The orbiting spacecraft is also about to set a record for data relayed from the Martian surface.</div><div class="overlay_arrow"><img alt="More" src="/assets/overlay-arrow.png"/></div></div><div class="list_image"><img alt="This series of images shows carbon dioxide ice sublimating (going directly from a solid to a gas) inside a pit at Mars' south pole." src="/system/news_items/list_view_images/8438_mro-trim_main-226.jpg"/></div><div class="bottom_gradient"><div><h3>NASA's MRO Completes 60,000 Trips Around Mars</h3></div></div></a><div class="list_text"><div class="list_date">May 15, 2019</div><div class="content_title"><a href="/news/8438/nasas-mro-completes-60000-trips-around-mars/" target="_self">NASA's MRO Completes 60,000 Trips Around Mars</a></div><div class="articl

In [13]:
# The headline text sits in the slide's <div class="content_title">.
title_div = slide_elem.find("div", class_='content_title')
news_title = title_div.get_text()
print(news_title)
 

NASA's MRO Completes 60,000 Trips Around Mars


In [14]:
# The teaser paragraph sits in the slide's <div class="article_teaser_body">.
teaser_div = slide_elem.find("div", class_='article_teaser_body')
news_para = teaser_div.get_text()
print(news_para)

The orbiting spacecraft is also about to set a record for data relayed from the Martian surface.


In [15]:
# Record the Part 1 headline and teaser in the shared results dictionary,
# then show the dictionary so far.
mars_info.update({
    "Mars_News_Title": news_title,
    "Mars_News_Body": news_para,
})
pprint(mars_info)

{'Mars_News_Body': 'The orbiting spacecraft is also about to set a record for '
                   'data relayed from the Martian surface.',
 'Mars_News_Title': "NASA's MRO Completes 60,000 Trips Around Mars"}


# Part 2. JPL Mars Images

In [16]:
# JPL space-images search page, filtered to the Mars category.
url2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

In [17]:
# Load the JPL Mars image search page in the browser.
browser.visit(url2)

In [18]:
# Look up the element with id 'full_image' on the JPL page.
full_image_elem = browser.find_by_id('full_image')

In [19]:
# Click it; the following cells then look for a 'more info' link.
full_image_elem.click()

In [20]:
# Fixed 3-second pause after the click; presumably lets the page update
# before the 'more info' link is searched for (TODO confirm).
time.sleep(3)

In [21]:
# Find link(s) whose visible text contains 'more info'.
# NOTE(review): newer splinter releases appear to replace find_link_by_* with
# browser.links.find_by_* — confirm against the installed version.
more_info_elem = browser.find_link_by_partial_text('more info')

In [22]:
# Follow the 'more info' link; the page reached here is parsed below for the image.
more_info_elem.click()

In [23]:
# Fixed 3-second pause before the page source is captured in the next cell.
time.sleep(3)

In [24]:
# Snapshot the HTML of the page reached via the 'more info' link.
html2 = browser.html

In [25]:
# Parse that snapshot so the image tag can be located with a CSS selector.
image_soup = BeautifulSoup(html2, 'html.parser')

In [26]:
# First <img> nested in a link inside <figure class="lede">.
lede1 = image_soup.select_one('figure.lede a img')

In [27]:
# Read the image's `src` attribute (a site-relative path in the run shown below).
image_url = lede1.get("src")

In [28]:
# Show the extracted `src` value.
print(image_url)

/spaceimages/images/largesize/PIA18905_hires.jpg


In [29]:
# Prefix the relative `src` path with the JPL site origin to get an absolute URL.
jpl_base_url = "https://www.jpl.nasa.gov"
full_image_url = jpl_base_url + image_url

In [30]:
# Show the absolute image URL.
print(full_image_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA18905_hires.jpg


In [31]:
# Store the absolute image URL under the "Mars_Full_Image" key of mars_info.
mars_info["Mars_Full_Image"] = full_image_url

In [32]:
# Pretty-print the results collected so far (news + featured image).
pprint(mars_info)

{'Mars_Full_Image': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA18905_hires.jpg',
 'Mars_News_Body': 'The orbiting spacecraft is also about to set a record for '
                   'data relayed from the Martian surface.',
 'Mars_News_Title': "NASA's MRO Completes 60,000 Trips Around Mars"}


# Part 3. Mars Weather Tweet

In [33]:
# Twitter timeline of the Mars weather report account.
url3 = "https://twitter.com/marswxreport?lang=en"

In [34]:
# Load the Twitter timeline in the browser.
browser.visit(url3)

In [35]:
# Fixed 3-second pause before the page source is captured in the next cell.
time.sleep(3)

In [36]:
# Snapshot the HTML of the timeline page.
html3 = browser.html

In [37]:
# Parse the timeline snapshot; `mars_weather` is the whole parsed page, not a single tweet.
mars_weather = BeautifulSoup(html3, 'html.parser')

In [38]:
# Exploratory lookup: list the <p> tags matching this tweet-text class string.
# The result is only displayed; it is not assigned to a variable.
mars_weather.find_all('p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text") 

[<p class="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" data-aria-label-part="0" lang="en">Congratulations to our orbiting companion! 
 <a class="twitter-atreply pretty-link js-nav" data-mentioned-user-id="11348282" dir="ltr" href="/NASA"><s>@</s><b>NASA</b></a>'s MRO Completes 60,000 Trips Around <a class="twitter-hashtag pretty-link js-nav" data-query-source="hashtag_click" dir="ltr" href="/hashtag/Mars?src=hash"><s>#</s><b>Mars</b></a>: <a class="twitter-timeline-link" data-expanded-url="https://go.nasa.gov/2w0c36K" dir="ltr" href="https://t.co/4vrKCzKY6r" rel="nofollow noopener" target="_blank" title="https://go.nasa.gov/2w0c36K"><span class="tco-ellipsis"></span><span class="invisible">https://</span><span class="js-display-url">go.nasa.gov/2w0c36K</span><span class="invisible"></span><span class="tco-ellipsis"><span class="invisible"> </span></span></a>.<a class="twitter-timeline-link u-hidden" data-pre-embedded="true" dir="ltr" href="https://t.co/Q0jtKuxBK5">pic

In [39]:
# Take the text of the first tweet on the timeline.
# The tweet is selected by its `tweet-text` CSS class rather than by position
# among all <p> tags (previously index 5), so unrelated paragraphs added to or
# removed from the page no longer change which element is picked.
first_tweet = mars_weather.find('p', class_='tweet-text')
if first_tweet is None:
    # Fail with a clear message instead of an opaque lookup error.
    raise ValueError(
        "No tweet text found on the page; the markup may have changed "
        "or the page had not finished loading."
    )

# NOTE(review): the newest tweet is not necessarily a weather report (the run
# captured below returned a congratulatory tweet) — confirm whether filtering is needed.
tweet_mars_weather = first_tweet.text
print(tweet_mars_weather)

Congratulations to our orbiting companion! 
@NASA's MRO Completes 60,000 Trips Around #Mars: https://go.nasa.gov/2w0c36K .pic.twitter.com/Q0jtKuxBK5


In [40]:
# Store the scraped tweet text under the "Mars_Weather_Tweet" key of mars_info.
mars_info["Mars_Weather_Tweet"] = tweet_mars_weather

In [41]:
# Pretty-print the results collected so far (news, featured image, tweet).
pprint(mars_info)

{'Mars_Full_Image': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA18905_hires.jpg',
 'Mars_News_Body': 'The orbiting spacecraft is also about to set a record for '
                   'data relayed from the Martian surface.',
 'Mars_News_Title': "NASA's MRO Completes 60,000 Trips Around Mars",
 'Mars_Weather_Tweet': 'Congratulations to our orbiting companion! \n'
                       "@NASA's MRO Completes 60,000 Trips Around #Mars: "
                       'https://go.nasa.gov/2w0c36K\xa0'
                       '.pic.twitter.com/Q0jtKuxBK5'}


# Part 4. Mars Facts

In [42]:
# Page whose HTML table of Mars facts is read with pandas below.
url4 = 'https://space-facts.com/mars/'

In [43]:
# Let pandas fetch url4 and parse its HTML tables; the result is a list
# (one entry in the run shown below). The bare name displays it.
tables = pd.read_html(url4)
tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.42 x 10^23 kg (10.7% Earth)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.52 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                  -153 to 20 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers]

In [44]:
# Use the first parsed table as the Mars facts frame and give its two
# columns readable headers. (A previous intermediate rename to "0"/"1" was
# overwritten immediately and has been dropped.)
# NOTE: `df` is the same object as `tables[0]`, so the rename is visible there too.
df = tables[0]
df.columns = ["Mars Profile", "Fact Data"]
df.head(10)

Unnamed: 0,Mars Profile,Fact Data
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [45]:
# pandas DataFrames have a to_html method; called with no arguments it returns
# the table markup as a string, which is displayed below.
html_table = df.to_html()
html_table


'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars Profile</th>\n      <th>Fact Data</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd mille

In [46]:
# Drop every newline from the generated markup so the table is a single-line string.
html_table = ''.join(html_table.split('\n'))


In [47]:
# Display the markup after newline removal.
html_table

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Mars Profile</th>      <th>Fact Data</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <th>1</th>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>4</th>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <th>5</th>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>6</th>      <td>Surface Temperature:</td>      <td>-153 to 20 °C</td>    </tr>    <tr>      <th>7</th>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>8</th>      <td>Recorded By:</td>      <td>Egyptian as

In [48]:
# Store the single-line HTML table under the "Mars_Facts_Table" key of mars_info.
mars_info["Mars_Facts_Table"] = html_table

In [49]:
# to_html also accepts a target path: this writes the table markup to
# 'table.html' (a relative path) instead of returning a string.

df.to_html('table.html')

# Part 5. Mars Hemispheres

In [50]:
# Open the USGS Astrogeology search results for "hemisphere enhanced" Mars
# products and parse the page source.
mars_hemis = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(mars_hemis)

# NOTE: `html` is rebound here; it previously held the Part 1 news page.
html = browser.html
soup = BeautifulSoup(html, "html.parser")


In [None]:
 #Retreive all items that contain mars hemispheres information
# Build a list of {"title", "img_url"} records, one per hemisphere, by
# following each result card's link and reading the detail page's wide image.
#
# Changes from the earlier version of this cell:
#   * each detail page is parsed into `detail_soup`, not back into `soup`, so
#     `soup` still holds the search-results page and this cell can be re-run;
#   * the browser returns to the results page at the end, because the
#     click-through cells below search the *current* page for
#     "a.product-item h3" and would find nothing on a detail page.
#
# NOTE(review): in the visible cells this list is never stored in mars_info;
# the click-through loop further down builds `hemisphere_list` instead.

# Each <div class="item"> on the results page describes one hemisphere.
items = soup.find_all('div', class_='item')

# Site origin used to turn the relative hrefs/srcs into absolute URLs.
hemispheres_main_url = 'https://astrogeology.usgs.gov'

hemisphere_image_urls = []

for item in items:
    # Hemisphere name as shown on the result card.
    title = item.find('h3').text

    # Relative link to the hemisphere's detail page.
    partial_img_url = item.find('a', class_='itemLink product-item')['href']

    # Load the detail page and parse it separately from the results page.
    browser.visit(hemispheres_main_url + partial_img_url)
    partial_img_html = browser.html
    detail_soup = BeautifulSoup(partial_img_html, 'html.parser')

    # The <img class="wide-image"> tag carries the relative image path.
    img_url = hemispheres_main_url + detail_soup.find('img', class_='wide-image')['src']

    hemisphere_image_urls.append({"title": title, "img_url": img_url})

# Leave the browser where this cell found it: on the search-results page.
browser.visit(mars_hemis)

# Display hemisphere_image_urls
hemisphere_image_urls

In [52]:
# html5 = browser.html

In [53]:
# Start an empty list for the hemisphere records, and find the <h3> elements
# inside `a.product-item` links on the page currently loaded in the browser.
# NOTE(review): this relies on the browser still being on the search-results page.
hemisphere_list = []
links = browser.find_by_css("a.product-item h3")

In [54]:
# Show the matched elements (four WebDriverElement objects in the run below).
print(links)

[<splinter.driver.webdriver.WebDriverElement object at 0x107611e80>, <splinter.driver.webdriver.WebDriverElement object at 0x107611668>, <splinter.driver.webdriver.WebDriverElement object at 0x107611cf8>, <splinter.driver.webdriver.WebDriverElement object at 0x107611518>]


In [55]:
# Visit each hemisphere's detail page by clicking its title, record the
# 'Sample' image link and the page title, then return to the results page.
for idx in range(len(links)):
    print(idx)

    # Look the titles up again on every pass: elements found before a
    # navigation would be stale by now.
    browser.find_by_css("a.product-item h3")[idx].click()

    # The 'Sample' anchor's href is the image URL; the <h2 class="title">
    # holds the hemisphere name.
    sample_link = browser.find_link_by_text('Sample').first
    hemisphere = {
        'img_url': sample_link['href'],
        'title': browser.find_by_css("h2.title").text,
    }

    hemisphere_list.append(hemisphere)
    print(hemisphere_list)

    # Go back to the results page for the next pass.
    browser.back()

0
[{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg', 'title': 'Cerberus Hemisphere Enhanced'}]
1
[{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg', 'title': 'Cerberus Hemisphere Enhanced'}, {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg', 'title': 'Schiaparelli Hemisphere Enhanced'}]
2
[{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg', 'title': 'Cerberus Hemisphere Enhanced'}, {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg', 'title': 'Schiaparelli Hemisphere Enhanced'}, {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg', 'title': 'Syrtis Major Hemisphere Enhanced'}]
3
[{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Vik

In [56]:
# Show the completed list of hemisphere records.
print(hemisphere_list)

[{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg', 'title': 'Cerberus Hemisphere Enhanced'}, {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg', 'title': 'Schiaparelli Hemisphere Enhanced'}, {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg', 'title': 'Syrtis Major Hemisphere Enhanced'}, {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg', 'title': 'Valles Marineris Hemisphere Enhanced'}]


In [57]:
# Store the list of hemisphere records under the "Hemisphere_Image_URLs" key of mars_info.
mars_info["Hemisphere_Image_URLs"] = hemisphere_list

In [58]:
# Display the full results dictionary.
mars_info

{'Mars_News_Title': "NASA's MRO Completes 60,000 Trips Around Mars",
 'Mars_News_Body': 'The orbiting spacecraft is also about to set a record for data relayed from the Martian surface.',
 'Mars_Full_Image': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA18905_hires.jpg',
 'Mars_Weather_Tweet': "Congratulations to our orbiting companion! \n@NASA's MRO Completes 60,000 Trips Around #Mars: https://go.nasa.gov/2w0c36K\xa0.pic.twitter.com/Q0jtKuxBK5",
 'Mars_Facts_Table': '<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Mars Profile</th>      <th>Fact Data</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <th>1</th>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2 (Phobos &a

In [59]:
# Timestamp the scrape with the current UTC time and keep it alongside the results.
# The value carries no tzinfo (see the pprint output of the next cell).
now_datetime = datetime.datetime.utcnow()
mars_info.update({"Date_Time": now_datetime})

In [60]:
# Pretty-print the final dictionary, now including the timestamp.
pprint(mars_info)

{'Date_Time': datetime.datetime(2019, 5, 16, 19, 43, 0, 536961),
 'Hemisphere_Image_URLs': [{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
                            'title': 'Cerberus Hemisphere Enhanced'},
                           {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
                            'title': 'Schiaparelli Hemisphere Enhanced'},
                           {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
                            'title': 'Syrtis Major Hemisphere Enhanced'},
                           {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
                            'title': 'Valles Marineris Hemisphere Enhanced'}],
 'Mars_Facts_Table': '<table border="1" class="dataframe">  <thead>    <tr '
         

In [66]:
# Spot-check the first stored hemisphere record.
# NOTE(review): the execution count jumps from 60 to 66 here, so cells were run
# or removed out of order — confirm the notebook survives Restart & Run All.
mars_info["Hemisphere_Image_URLs"][0]

{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
 'title': 'Cerberus Hemisphere Enhanced'}