In [1]:
#https://www.scrapehero.com/scrape-yahoo-finance-stock-market-data/
#Construct the URL of the quote page on Yahoo Finance.
#   For example, the URL for Apple is http://finance.yahoo.com/quote/AAPL?p=AAPL
#Step 1) Download HTML of the search result page using Python Requests
#Step 2) Inspect Data Source via web browser (IE) and Python (Print, BS4 Lib)
#Step 3) Parse the page using BS4 HTML Parser + HTML Tag Information
#Step 4) Save the data for Storage - CSV & JSON files.

import requests  #GET/POST/PUT API requests
from contextlib import closing  #utilities for common tasks involving the "with" statement.
from bs4 import BeautifulSoup #BeautifulSoup4 - HTML Web Scraping #Scrapy
import csv #write stock information to csv file
import json #write stock informaton to json file, for future API use
#more info on bs4:  https://realpython.com/python-web-scraping-practical-introduction/

In [2]:
#Attempts to get the content at `url` by making an HTTP GET request.
#If the content-type of response is some kind of HTML/XML, return the
#text content, otherwise return None.
def simple_get(url, timeout=None):
    """Fetch `url` with an HTTP GET and return the body if it looks like HTML.

    Args:
        url: Address to request.
        timeout: Optional timeout in seconds forwarded to `requests.get`.
            The default of None keeps the previous behaviour (no timeout).

    Returns:
        `resp.content` when `is_good_response` accepts the response,
        otherwise None. None is also returned (after printing the error)
        when requests raises a RequestException.
    """
    try:
        #closing() guarantees the streamed response is released when the block exits
        with closing(requests.get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                print('http request successful')
                return resp.content
            return None

    except requests.exceptions.RequestException as e:
        #the bare name RequestException is never imported in this notebook, so it
        #raised NameError instead of handling the failure; reach it via `requests`
        print('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

In [3]:
#Returns True if the response seems to be HTML, False otherwise.
def is_good_response(resp):
    """Return True if the response seems to be HTML, False otherwise.

    Args:
        resp: Response-like object exposing `status_code` and a `headers` mapping.

    Returns:
        True when the status code is 200 and the Content-Type header contains
        'html' (compared case-insensitively). False otherwise, including when
        the header is missing.
    """
    #headers[...] raised KeyError on a missing header, and .lower() ran before the
    #old `is not None` check could ever help; fall back to an empty string instead
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type

In [4]:
#Web Scrape - Yahoo Finance price history page - Apple (AAPL) example
#simple_get returns None when the request fails or the response is not HTML
raw_html = simple_get('https://finance.yahoo.com/quote/AAPL/history?p=AAPL')

http request successful


In [5]:
#using BeautifulSoup (with Python's built-in 'html.parser') to parse the downloaded page
#so it can be inspected and scraped; raw_html is None if simple_get did not succeed
html = BeautifulSoup(raw_html, 'html.parser')
#print(html.prettify())

In [6]:
# Select every <td> whose class attribute is "Py(10px) Ta(start) Pend(10px)".
# NOTE(review): these appear to be the date cells of the history table - confirm against the page.
# The next cell rebinds stock_date to a list built from the same query, so this
# result is only useful for inspection (uncomment the print below).
stock_date = html.findAll("td", attrs={"class": "Py(10px) Ta(start) Pend(10px)"})
#print(stock_date)

In [7]:
#iterate through elements to get stock dates
#each matching <td> contributes the string of its first <span>; a matching cell
#without a <span> raises AttributeError, exactly as the previous loop did
stock_date = [
    cell.span.string
    for cell in html.find_all("td", attrs={"class": "Py(10px) Ta(start) Pend(10px)"})
]
#print(stock_date)

#create list to store other stock data
#this is one flat list in document order; later code indexes it in groups per date
stock_data = [
    cell.span.string
    for cell in html.find_all("td", attrs={"class": "Py(10px) Pstart(10px)"})
]
#print(stock_data)

In [8]:
#combine date and stock information into dictionary data structure
#stock_data is a flat list read in groups of FIELDS_PER_ROW per date; within a
#group, offset 0 is used as "open", 3 as "close" and 5 as "volume"
#NOTE(review): the unused offsets are presumably high/low/adj close - confirm against the page
FIELDS_PER_ROW = 6

#only build rows that have both a date and a complete group of data cells:
#the old range(len(stock_date)-1) always dropped the last date, and the fixed
#i*6 indexing raised IndexError whenever stock_data was shorter than expected
row_count = min(len(stock_date), len(stock_data) // FIELDS_PER_ROW)

stock_info = {}
for i in range(row_count):  #get stock info for every day
    offset = i * FIELDS_PER_ROW
    stock_info[i] = {
        "date": stock_date[i],
        "open": stock_data[offset + 0],
        "close": stock_data[offset + 3],
        "volume": stock_data[offset + 5],
    }
    print(stock_info[i])

{'date': 'Feb 06, 2019', 'open': '174.65', 'close': '174.24', 'volume': '27,959,581'}
{'date': 'Feb 05, 2019', 'open': '172.86', 'close': '174.18', 'volume': '36,066,500'}
{'date': 'Feb 04, 2019', 'open': '167.41', 'close': '171.25', 'volume': '31,495,500'}
{'date': 'Feb 01, 2019', 'open': '166.96', 'close': '166.52', 'volume': '32,668,100'}
{'date': 'Jan 31, 2019', 'open': '166.11', 'close': '166.44', 'volume': '40,739,600'}
{'date': 'Jan 30, 2019', 'open': '163.25', 'close': '165.25', 'volume': '61,109,800'}
{'date': 'Jan 29, 2019', 'open': '156.25', 'close': '154.68', 'volume': '41,587,200'}
{'date': 'Jan 28, 2019', 'open': '155.79', 'close': '156.30', 'volume': '26,192,100'}
{'date': 'Jan 25, 2019', 'open': '155.48', 'close': '157.76', 'volume': '33,535,500'}
{'date': 'Jan 24, 2019', 'open': '154.11', 'close': '152.70', 'volume': '25,441,500'}
{'date': 'Jan 23, 2019', 'open': '154.15', 'close': '153.92', 'volume': '23,130,600'}
{'date': 'Jan 22, 2019', 'open': '156.41', 'close': '1

In [9]:
#export stock information to csv
#take the rows in key order instead of counting 0..len-1 by hand, so the export
#does not depend on the keys being contiguous integers starting at 0
rows = [stock_info[key] for key in sorted(stock_info)]
if rows:
    with open('C:\\Python\\Data\\stock_hist.csv', 'w') as f:  # Just use 'w' mode in 3.x
        #header columns come from the first row's keys
        w = csv.DictWriter(f, rows[0].keys(), lineterminator = '\n')
        w.writeheader()
        w.writerows(rows)
else:
    #stock_info[0] used to raise KeyError here, after the file had already been truncated
    print('No stock rows to export; csv file not written.')

In [10]:
#export stock information to json
#json.dumps turns the integer row keys into strings ("0", "1", ...)
jsonarray = json.dumps(stock_info)
print(jsonarray)

#write json file to local drive, for future use
#dump the dict itself: passing the already-serialized `jsonarray` string to
#json.dump encoded it a second time, so the file held one quoted, escaped
#string instead of a JSON object
with open('C:\\Python\\Data\\stock_hist.json', 'w') as f:
    json.dump(stock_info, f)

{"0": {"date": "Feb 06, 2019", "open": "174.65", "close": "174.24", "volume": "27,959,581"}, "1": {"date": "Feb 05, 2019", "open": "172.86", "close": "174.18", "volume": "36,066,500"}, "2": {"date": "Feb 04, 2019", "open": "167.41", "close": "171.25", "volume": "31,495,500"}, "3": {"date": "Feb 01, 2019", "open": "166.96", "close": "166.52", "volume": "32,668,100"}, "4": {"date": "Jan 31, 2019", "open": "166.11", "close": "166.44", "volume": "40,739,600"}, "5": {"date": "Jan 30, 2019", "open": "163.25", "close": "165.25", "volume": "61,109,800"}, "6": {"date": "Jan 29, 2019", "open": "156.25", "close": "154.68", "volume": "41,587,200"}, "7": {"date": "Jan 28, 2019", "open": "155.79", "close": "156.30", "volume": "26,192,100"}, "8": {"date": "Jan 25, 2019", "open": "155.48", "close": "157.76", "volume": "33,535,500"}, "9": {"date": "Jan 24, 2019", "open": "154.11", "close": "152.70", "volume": "25,441,500"}, "10": {"date": "Jan 23, 2019", "open": "154.15", "close": "153.92", "volume": "