-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_mars.py
83 lines (75 loc) · 2.85 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Dependencies
from bs4 import BeautifulSoup as bs
from splinter import Browser
import pandas as pd
# Define function
def scrape(executable_path='/Users/joshchung/Bootcamp/chromedriver',
           *, headless=False):
    """Scrape several Mars-related web pages and return the results as a dict.

    A single Chrome browser session (driven through splinter) is opened,
    used for every page that needs a rendered DOM, and always closed again
    before the function returns or raises.

    Args:
        executable_path: Filesystem path to the chromedriver binary. Defaults
            to the path that was previously hard-coded in this function.
        headless: Forwarded to ``Browser``; ``False`` (the default) keeps the
            original behaviour of showing the browser window.

    Returns:
        A flat dict with these keys, in this insertion order:
        ``article_date``, ``article_title``, ``article_teaser``, ``img_url``,
        ``last_tweet``, ``mars_table``, and for each ``x`` in 0..3
        ``marsimg_x``, ``marstitle_x``, ``marsdld_x``.

    Raises:
        No errors are caught here. If a page no longer contains an element
        this code looks for, the underlying exception (for example
        ``AttributeError`` from using the ``None`` that ``soup.find`` returned,
        or ``IndexError`` from an empty result list) propagates to the caller.
    """
    browser = Browser('chrome', executable_path=executable_path,
                      headless=headless)
    try:
        return_dict = {}
        return_dict.update(_scrape_news(browser))
        return_dict.update(_scrape_featured_image(browser))
        return_dict.update(_scrape_weather(browser))
        return_dict.update(_scrape_facts())
        return_dict.update(_scrape_hemispheres(browser))
        return return_dict
    finally:
        # Without this the Chrome/chromedriver processes outlive the call,
        # piling up one per invocation (and on every failed scrape).
        browser.quit()


def _soup_for(browser, url):
    """Navigate ``browser`` to ``url`` and return the page parsed with lxml."""
    browser.visit(url)
    return bs(browser.html, 'lxml')


def _scrape_news(browser):
    """Return date, title and teaser text of the first news slide."""
    soup = _soup_for(browser, 'https://mars.nasa.gov/news/')
    # Only the first <li class="slide"> is used.
    slide = soup.find('li', class_="slide")
    return {
        'article_date': slide.find('div', class_="list_date").text,
        'article_title': slide.find('div', class_="content_title").text,
        'article_teaser': slide.find('div', class_="article_teaser_body").text,
    }


def _scrape_featured_image(browser):
    """Return the absolute URL of the first carousel image on the JPL page."""
    soup = _soup_for(
        browser,
        'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars',
    )
    carousel_items = soup.find_all('article', class_="carousel_item")
    style = carousel_items[0].get('style')
    # The image path sits between "url('" and "');" in the inline style
    # attribute; cut out just that part.
    relative_path = style.split("url('")[1].split("');")[0]
    return {'img_url': 'https://www.jpl.nasa.gov' + relative_path}


def _scrape_weather(browser):
    """Return the text of the first tweet paragraph, flattened to one line."""
    soup = _soup_for(browser, 'https://twitter.com/marswxreport')
    tweet_text = soup.find('p', class_="tweet-text").text
    return {'last_tweet': tweet_text.replace('\n', ' ')}


def _scrape_facts():
    """Return the first table on the Mars facts page as an HTML string."""
    # pandas fetches and parses the page itself, so no browser is needed.
    tables = pd.read_html('https://space-facts.com/mars/')
    mars_df = tables[0]
    mars_df.columns = ['Statistic', 'Values']
    mars_df = mars_df.set_index('Statistic')
    # Newlines are stripped so the markup is a single-line string.
    return {'mars_table': mars_df.to_html().replace('\n', '')}


def _scrape_hemispheres(browser, count=4):
    """Return image link, title and download link for ``count`` result pages.

    For each index ``x`` the ``x``-th <h3> element on the search results page
    is clicked and the resulting page is parsed.
    """
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    mars_urls = {}
    for x in range(count):
        # Reload the results page on every pass so the <h3> elements are
        # looked up fresh; this also makes a trailing browser.back()
        # unnecessary.
        browser.visit(url)
        links = browser.find_by_tag('h3')
        links[x].click()
        soup = bs(browser.html, 'lxml')
        downloads = soup.find('div', class_="downloads")
        dl_links = downloads.find_all('a')
        # NOTE(review): the first anchor is stored as the image link and the
        # second as the download link -- confirm against the live page which
        # file each one actually points to.
        mars_urls.update({
            f"marsimg_{x}": dl_links[0].get('href'),
            f"marstitle_{x}": soup.find('h2', class_="title").text,
            f"marsdld_{x}": dl_links[1].get('href'),
        })
    return mars_urls