# Mission to Mars

### Import Dependencies

In [24]:
import pandas as pd
import pymongo as pmo
from bs4 import BeautifulSoup as bsp
from splinter import Browser
import time
import os
import os.path
from os import path


### Set-up Web Browser Driver for Scraping

In [2]:
# specify the path of browser driver want to use
executable_path = {'executable_path': 'chromedriver.exe'}

# specify the name of browser want to use
browser_name = 'chrome'

# specify parser used
lib_used = 'html.parser'

# start browser
browser = Browser(browser_name, **executable_path, headless=False)

### NASA Mars News Web Scraping

In [3]:
# link to NASA Mars news
article_url = 'https://mars.nasa.gov/news/'

# access & get content 
browser.visit(article_url)

t_wait = 0
del_t = 0.25

# condition to make sure the webpage is loaded
if browser.is_element_present_by_tag('/html') == False:
    time.sleep(t_wait)
    t_wait += del_t
else:
    pass

soup = bsp(browser.html, lib_used)

# return results
results = soup.find_all('div', class_='list_text')
results

[<div class="list_text"><div class="list_date">April 21, 2020</div><div class="content_title"><a href="/news/8654/how-nasas-perseverance-mars-team-adjusted-to-work-in-the-time-of-coronavirus/" target="_self">How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus </a></div><div class="article_teaser_body">Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first.</div></div>,
 <div class="list_text"><div class="list_date">April 20, 2020</div><div class="content_title"><a href="/news/8649/nasas-perseverance-mars-rover-gets-balanced/" target="_self">NASA's Perseverance Mars Rover Gets Balanced</a></div><div class="article_teaser_body">The mission team performed a crucial weight-balancing test on the rover in preparation for this summer's history-making launch to the Red Planet.</div></div>,
 <div class="list_text"><div class="list_date">April 1

In [4]:
# pull the lastest news from the list with index = 0 :: indication of the top latest
latest_news = results[0]
latest_news

<div class="list_text"><div class="list_date">April 21, 2020</div><div class="content_title"><a href="/news/8654/how-nasas-perseverance-mars-team-adjusted-to-work-in-the-time-of-coronavirus/" target="_self">How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus </a></div><div class="article_teaser_body">Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first.</div></div>

In [5]:
# workflow: 
    # find all the div, then use the unique class of each div  
    # to access the content of a specific div
for tag in latest_news.find_all('div'):
    if "content_title" in tag.attrs["class"]:
        las_news_title = tag.a.text
        las_news_link = f"https://mars.nasa.gov/{tag.a['href']}"
    elif "article_teaser_body" in tag.attrs["class"]:
        las_news_content = tag.text
     
        
# print out what found in the loop
print(f'>> Lastest news of Mars from NASA:\n\
    {las_news_title}\n\n\
>> News Content:\n\
    {las_news_content}\n\n\
>> News Link:\n\
    {las_news_link}')

>> Lastest news of Mars from NASA:
    How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus 

>> News Content:
    Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first.

>> News Link:
    https://mars.nasa.gov//news/8654/how-nasas-perseverance-mars-team-adjusted-to-work-in-the-time-of-coronavirus/


### JPL Mars Space Images - Featured Image Web Scraping

In [6]:
# link to Mars Image
ft_img_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

# access website
browser.visit(ft_img_url)

t_wait = 0
del_t = 0.25

# condition to make sure the webpage is loaded
if browser.is_element_present_by_tag('/html') == False:
    time.sleep(t_wait)
    t_wait += del_t
else:
    pass


# click on couple of buttons to gain access to the full size image page
# condition with if to make sure the subsequent codes will still run incase of issue
if browser.links.find_by_partial_text('FULL IMAGE'):
    browser.links.find_by_partial_text('FULL IMAGE').click()
    
else:
    print(f'No "FULL IMAGE" Button found')
  
    
if browser.links.find_by_partial_text('more info'):
    browser.links.find_by_partial_text('more info').click()
    
    # delay time so browser can load before proceeding
    time.sleep(2)
    
else:
     print(f'No "more info" Button found')
        
# condition with if to make sure the subsequent codes will still run incase of issue
if browser.links.find_by_partial_href('largesize'):
    browser.links.find_by_partial_href('largesize').click()
    
else:
    print(f'No "Full size Image " Button found')
    
feature_image_url = browser.url
feature_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16217_hires.jpg'

### Mars Weather Web Scraping

In [7]:
# link to Mars weather
weather_url = 'https://twitter.com/marswxreport?lang=en'

# access & get content 
browser.visit(weather_url)

t_wait = 0
del_t = 0.25

# condition to make sure the webpage is loaded
if browser.is_element_present_by_tag('/html') == False:
    time.sleep(t_wait)
    t_wait += del_t
else:
    pass

# create soup object
soup = bsp(browser.html, lib_used)
print(soup)

# return results
results = soup.find_all('span', class_='css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0')
results

<html dir="ltr" lang="en" style="font-size: 15px;"><head><meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=0,viewport-fit=cover" name="viewport"/>
<link href="//abs.twimg.com" rel="preconnect"/>
<link href="//api.twitter.com" rel="preconnect"/>
<link href="//pbs.twimg.com" rel="preconnect"/>
<link href="//t.co" rel="preconnect"/>
<link href="//video.twimg.com" rel="preconnect"/>
<link href="//abs.twimg.com" rel="dns-prefetch"/>
<link href="//api.twitter.com" rel="dns-prefetch"/>
<link href="//pbs.twimg.com" rel="dns-prefetch"/>
<link href="//t.co" rel="dns-prefetch"/>
<link href="//video.twimg.com" rel="dns-prefetch"/>
<link as="script" crossorigin="anonymous" href="https://abs.twimg.com/responsive-web/web/polyfills.bab5fe74.js" nonce="" rel="preload"/>
<link as="script" crossorigin="anonymous" href="https://abs.twimg.com/responsive-web/web/vendors~main.e0482f54.js" nonce="" rel="preload"/>
<link as="script" crossorigin="anonymous" 

[<span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">Log in</span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">Sign up</span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">See new Tweets</span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">Follow</span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0"><span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">Mars Weather</span></span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">Mars Weather</span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">@MarsWxReport</span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">Updates as avail from the REMS weather instrument aboard </span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">.  Data credit: Centro deAstrobiologia, FMI, 

In [8]:
# loop thru the list and find partial match for the weather content
# as soon as the first string read, stop the loop
for ea_tag in results:
    if ea_tag.text[0:7] == "InSight":
        mars_weather = ea_tag.text
        break
mars_weather

'InSight sol 499 (2020-04-22) low -94.4ºC (-137.9ºF) high -3.6ºC (25.5ºF)\nwinds from the SW at 5.1 m/s (11.3 mph) gusting to 16.2 m/s (36.1 mph)\npressure at 6.70 hPa'

### Mars Facts

In [9]:
# link to Mars weather
facts_url = 'https://space-facts.com/mars/'

# access & get content 
browser.visit(facts_url)
time.sleep(3)   
# soup = bsp(browser.html, lib_used)



# return results
# results = soup.find_all('span', class_='css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0')
# results



In [10]:
tables = pd.read_html(facts_url)
tables[0]
df = tables[0]
df.columns = ["Description", "Value"]
df

Unnamed: 0,Description,Value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [11]:
mars_info_table = df.to_html(index=False)
mars_info_table.replace("\n", "")

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th>Description</th>      <th>Value</th>    </tr>  </thead>  <tbody>    <tr>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <td>Mass:</td>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <td>Moons:</td>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <td>Surface Temperature:</td>      <td>-87 to -5 °C</td>    </tr>    <tr>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <td>Recorded By:</td>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

In [12]:
df.to_html('mars_info_table.html')

### Mars Hemispheres

In [13]:
# create function to process webpage and retrieve hemisphere images of Mars

def get_hemi_img(brwr, hemi_url, hemi_name):
    
    # since browser will take sometime to load,
    # create timer to delay the process and wait for chrome to load page
    # set wait time parameters
    t_wait = 0
    t_out = 15
    del_t = 1
    
    # access & get content by soup object
    browser.visit(hemi_url)
    soup = bsp(browser.html, lib_used)
    
    print(">> Progress = 10%")
    
    # condition to make sure the webpage is loaded
    if browser.is_element_present_by_tag('/html') == False:
        time.sleep(t_wait)
        t_wait += del_t
    else:
        pass

    print(">> Progress = 30%")
    
    # click on couple of buttons to gain access to the full size image page
    # condition with if to make sure the subsequent codes will still run incase of issue
    t_wait = 0
    if brwr.links.find_by_partial_text(hemi_name):
        brwr.links.find_by_partial_text(hemi_name).click()
        # condition to make sure the webpage is loaded
        print(">> Progress = 50%")
        if brwr.is_element_present_by_tag('/html') == False:
            if t_wait <= t_out:
                time.sleep(t_wait)
                t_wait += del_t
        else:
            print("Page takes too long to load!")
            pass

    else:
        print(f"Unable to find {hemi_name} Hemisphere Image")
        pass
    
    print(">> Progress = 60%")
    # create soup object
    time.sleep(3)
    soup = bsp(browser.html, lib_used)
    time.sleep(4)
    
    # return results
    results = soup.find_all('ul', class_='')
    time.sleep(4)
    t_wait = 0
    
    while not results:              
        if t_wait <= t_out:
            time.sleep(t_wait)
            results = soup.find_all('ul', class_='')
            t_wait += del_t
        else:
            print("Time out!")
            pass
    
    print(">> Progress = 70%")
    
    # loop to find the tag a and href of the full size image
    try:
        for rsl in results[0].find_all('a'):
            if rsl.attrs["href"] and (rsl.text).lower() == "sample":
                print('Getting Image Link')
                img_link = rsl['href']
                print('Getting Image Title')
                img_title = hemi_name + ' Hemisphere'
    except Exception:
        print(f"Unable to process {hemi_name} Hemisphere Data")
        pass
    print(">> Progress = 100%")
    return (img_title, img_link)

In [14]:
# link to Mars hemisphere pics
hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
hemi_name = ['Cerberus', 'Schiaparelli', 'Syrtis Major', 'Valles Marineris']

In [15]:
# get the list of dictionary of all hemisphere image info
hemisphere_image_urls = []
for name in hemi_name:
    print(f'>> Processing data of {name} Hemisphere')
    print(f'>> Please wait ...')
    answer = get_hemi_img(browser, hemi_url, name)
    dict_each_hemi = {"title": answer[0] , "img_url": answer[1]}
    hemisphere_image_urls.append(dict_each_hemi)
    print(f'>> Finish with {name} Hemisphere\n{("-")*25}')
hemisphere_image_urls               

>> Processing data of Cerberus Hemisphere
>> Please wait ...
>> Progress = 10%
>> Progress = 30%
>> Progress = 50%
>> Progress = 60%
>> Progress = 70%
Getting Image Link
Getting Image Title
>> Progress = 100%
>> Finish with Cerberus Hemisphere
-------------------------
>> Processing data of Schiaparelli Hemisphere
>> Please wait ...
>> Progress = 10%
>> Progress = 30%
>> Progress = 50%
>> Progress = 60%
>> Progress = 70%
Getting Image Link
Getting Image Title
>> Progress = 100%
>> Finish with Schiaparelli Hemisphere
-------------------------
>> Processing data of Syrtis Major Hemisphere
>> Please wait ...
>> Progress = 10%
>> Progress = 30%
>> Progress = 50%
>> Progress = 60%
>> Progress = 70%
Getting Image Link
Getting Image Title
>> Progress = 100%
>> Finish with Syrtis Major Hemisphere
-------------------------
>> Processing data of Valles Marineris Hemisphere
>> Please wait ...
>> Progress = 10%
>> Progress = 30%
>> Progress = 50%
>> Progress = 60%
>> Progress = 70%
Getting Image L

[{'title': 'Cerberus Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [16]:
# pd.DataFrame(hemisphere_image_urls)

## Summary of all Scraped Data

In [17]:
print(("=")*35, "START", ("=")*35)
print(las_news_title, "\n", ("-")*50)
print(las_news_content, "\n", ("-")*50)
print(las_news_link, "\n", ("-")*50)
print (feature_image_url, "\n", ("-")*50)
print (mars_weather, "\n", ("-")*50)
print(hemisphere_image_urls, "\n", ("-")*50)
print(" >>> Closing Browser...", "\n", ("-")*50)
try:
    browser.quit()
except Exception:
    pass
print(" >>> Browser Closed...", "\n", ("-")*50)
print(("=")*35, "END", ("=")*35)

How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus  
 --------------------------------------------------
Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first. 
 --------------------------------------------------
https://mars.nasa.gov//news/8654/how-nasas-perseverance-mars-team-adjusted-to-work-in-the-time-of-coronavirus/ 
 --------------------------------------------------
https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16217_hires.jpg 
 --------------------------------------------------
InSight sol 499 (2020-04-22) low -94.4ºC (-137.9ºF) high -3.6ºC (25.5ºF)
winds from the SW at 5.1 m/s (11.3 mph) gusting to 16.2 m/s (36.1 mph)
pressure at 6.70 hPa 
 --------------------------------------------------
[{'title': 'Cerberus Hemisphere', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced

### Export Jupyter Notebook to Python File

In [28]:
# define file name
python_file_name = 'scrape.py'

# if there is alreadt old 'scrape.py', then delete and reprocess a new one
if path.exists('scrape.py'):
    os.remove('scrape.py')

# if exception raises, just skip the export process
try:
    !jupyter nbconvert --to python mission_to_mars.ipynb
    os.rename("mission_to_mars.py", "scrape.py")
except Exception:
    print(Exception)
    pass


[NbConvertApp] Converting notebook mission_to_mars.ipynb to python
[NbConvertApp] Writing 8639 bytes to mission_to_mars.py
