In [27]:
# Import Splinter, BeautifulSoup, and supporting libraries
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [14]:
# resolve the ChromeDriver binary with webdriver_manager and store its path
# under the keyword name that is forwarded to splinter below
executable_path = {'executable_path': ChromeDriverManager().install()}
# start a visible (non-headless) Chrome session using that driver path
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [C:\Users\tgrah\.wdm\drivers\chromedriver\win32\90.0.4430.24\chromedriver.exe] found in cache


In [3]:
# assign the url of the site to scrape
url = 'https://redplanetscience.com'

# visit the url with the splinter browser
browser.visit(url)

# optional delay - this can help when scraping elements that are behind JS
# check for a div element that has the "list_text" class, waiting up to wait_time=1 for it to appear; the True/False result is the cell output
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

In [6]:
# grab the current page source from the browser
html = browser.html
# parse the html found and save the soup object to a variable
news_soup = soup(html, 'html.parser')
# declare the parent element that we can reference later for filtering results. Note: '.' selects by class in a CSS selector, so 'div.list_text' matches div elements with the list_text class
slide_elem = news_soup.select_one('div.list_text') # select_one returns only the first element matching the CSS selector (None if nothing matches)

# find the content that we want
slide_elem.find('div', class_='content_title') # Note: could assign this to a variable and add .text at the end to return the same result as the get_text() method.

<div class="content_title">What's Mars Solar Conjunction, and Why Does It Matter?</div>

In [7]:
# get the text from the html object returned from slide_elem.find
news_title = slide_elem.find('div', class_='content_title').get_text() # Note: .text also works; get_text() additionally accepts options such as separator and strip
news_title

"What's Mars Solar Conjunction, and Why Does It Matter?"

In [8]:
# Soup note: find() returns the first item found; find_all() returns all of the items that match the parameters

# get the article teaser text from the same parent element
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p

'NASA spacecraft at Mars are going to be on their own for a few weeks when the Sun comes between Mars and Earth, interrupting communications.'

### Featured Images

In [21]:
# visit the target URL
# TODO: figure out why this doesn't work if the browser window is already open
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [22]:
# find the full image button for the featured image and click it
full_image_elem = browser.find_by_tag('button')[1] # there are several buttons on the page, so we specify which one we want via its index in the list of button tags
full_image_elem.click()

In [23]:
# parse the current page html from the spaceimages website into a soup object
html = browser.html
img_soup = soup(html, 'html.parser')

In [25]:
# pull information from the featured image
# find the first img tag with the fancybox-image class and read its src attribute to get the relative image url
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

'image/featured/mars2.jpg'

In [26]:
# build the full url to the featured image by resolving the scraped src against the site's base url
# urljoin (rather than string formatting) also handles a src that is root-relative ('/image/...') or already an absolute url
img_url = urljoin('https://spaceimages-mars.com/', img_url_rel)
img_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

### Mars Facts

In [28]:
# scraping tables

# pandas can read html tables directly: read_html returns a list of DataFrames, one per table found, and we keep the first
# without explicit column headers and an index, both would just be numeric
df = pd.read_html('https://galaxyfacts-mars.com/')[0]
df.columns = ['Description', 'Mars', 'Earth']
# reassign instead of mutating in place so the step reads as a plain transformation
df = df.set_index('Description')
df

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [30]:
# convert the pandas table back to an html string with to_html, which can then be embedded in a web application
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Mars - Earth Comparison</td>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Diameter:</td>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Distance from Sun:</td>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Length of Year:</td>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Temperature:</td>\n      <td>-87 to -5 °C</

In [32]:
# quit the automated browser session. If you do not quit, later code may try to connect to the old automation session and you will get errors.
browser.quit()