## CODE1: Get data from the Yahoo Finance history page (HTML table)

In [1]:
import io
import os
import time
from datetime import datetime, timedelta

import lxml
import pandas as pd
import requests
from lxml import html

def format_date(date_datetime):
    """
    convert a datetime into a unix timestamp string

    parameters: date_datetime - datetime to convert; time.mktime treats it as local time

    returns the whole-second timestamp as a string
    """
    epoch_seconds = time.mktime(date_datetime.timetuple())
    return str(int(epoch_seconds))

def subdomain(symbol, start, end, filter='history'):
    """
    build the path and query string of a quote's history page

    parameters:
        symbol - ticker placed in the /quote/<symbol>/history path

        start - value for the period1 query parameter

        end - value for the period2 query parameter

        filter - value for the filter query parameter

    returns the path plus query string, starting with '/'
    """
    path_template = "/quote/{0}/history?period1={1}&period2={2}&interval=1d&filter={3}&frequency=1d"
    return path_template.format(symbol, start, end, filter)
 
def header_function(subdomain):
    """
    build the request headers used when fetching a history page

    parameters: subdomain - path and query string of the page, stored under "path"

    returns a dict mapping header names to values
    """
    return {
        # Fields describing the request target.
        "authority": "finance.yahoo.com",
        "method": "GET",
        "path": subdomain,
        "scheme": "https",
        # Content negotiation and caching.
        "accept": "text/html",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US,en;q=0.9",
        "cache-control": "no-cache",
        # NOTE(review): this cookie value looks like a placeholder - confirm.
        "cookie": "Cookie:identifier",
        "dnt": "1",
        "pragma": "no-cache",
        # Browser-style navigation hints.
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64)",
    }

def scrape_page(url, header):
    """
    download a page and parse its first HTML table with pandas

    parameters:
        url - address of the page to fetch

        header - dict of request headers sent with the GET request

    returns the list of DataFrames produced by pd.read_html for the first <table>

    raises:
        requests.HTTPError - the server answered with an error status

        requests.Timeout - the server did not respond within the timeout

        IndexError - the page contains no <table> element
    """
    # Without a timeout requests waits indefinitely on a stalled connection;
    # 30 seconds is an arbitrary but generous bound.
    page = requests.get(url, headers=header, timeout=30)
    # Fail on 4xx/5xx here rather than trying to parse an error page.
    page.raise_for_status()

    element_html = html.fromstring(page.content)
    table = element_html.xpath('//table')
    if not table:
        # IndexError is what table[0] raised before; kept so existing handlers
        # still match, but now with a message that names the cause.
        raise IndexError('no <table> element found at ' + url)

    table_tree = lxml.etree.tostring(table[0], method='xml')
    # tostring() returns bytes; read_html wants a file-like object, since
    # passing literal markup is deprecated in pandas >= 2.1.
    panda = pd.read_html(io.BytesIO(table_tree))
    return panda

if __name__ == '__main__':
    # Download one year of daily price history for `symbol` from the history
    # page and store it as data/<symbol>/<symbol>_<YYYYMMDD>.csv.
    symbol = 'COM7.BK'

    # Sample the clock once so the query window and the file name agree.
    today = datetime.today()
    dt_start = today - timedelta(days=365)
    dt_end = today

    start = format_date(dt_start)
    end = format_date(dt_end)

    sub = subdomain(symbol, start, end)
    header = header_function(sub)

    base_url = 'https://finance.yahoo.com'
    url = base_url + sub
    price_history = scrape_page(url, header)

    # Process data
    df = pd.DataFrame(price_history[0])
    # Keep only rows whose 'Close*' cell parses as a number.
    is_price_row = pd.to_numeric(df['Close*'], errors='coerce').notnull()
    # .copy() detaches the filtered rows, so adding a column below does not
    # write into a slice of the original frame (SettingWithCopyWarning).
    df = df[is_price_row].copy()
    df['symbol'] = symbol

    # Save to csv
    save_dir = 'data/' + symbol
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)
    # Zero-padded YYYYMMDD: concatenating year/month/day unpadded is ambiguous
    # (2020-01-12 and 2020-11-02 would both give "2020112").
    save_path = save_dir + '/' + symbol + '_' + today.strftime('%Y%m%d') + '.csv'
    df.to_csv(save_path)
    print('saved to ' + save_path)

saved to data/COM7.BK/COM7.BK_2020824.csv


## CODE2: Get data from CSV API

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from time import mktime

In [None]:
def _get_crumbs_and_cookies(stock):
    """
    get crumb and cookies for historical data csv download from yahoo finance

    parameters: stock - short-handle identifier of the company

    returns a tuple of header, crumb and cookie

    raises:
        IndexError - no crumb could be found in the downloaded page

        requests.Timeout - the server did not respond within the timeout
    """

    url = 'https://finance.yahoo.com/quote/{}/history'.format(stock)
    header = {'Connection': 'keep-alive',
              'Expires': '-1',
              'Upgrade-Insecure-Requests': '1',
              # Adjacent literals are joined at compile time. A backslash
              # continuation inside one literal would embed the next line's
              # indentation in the header value.
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
              }

    # Issue the request through the session so the `with` block actually
    # manages (and closes) the connection. The 30 second timeout is an
    # arbitrary bound that prevents an indefinite hang.
    with requests.session() as session:
        website = session.get(url, headers=header, timeout=30)

    soup = BeautifulSoup(website.text, 'lxml')
    crumb = re.findall('"CrumbStore":{"crumb":"(.+?)"}', str(soup))
    if not crumb:
        # IndexError is what crumb[0] raised before; kept so existing handlers
        # still match, but now with a message that names the cause.
        raise IndexError('no crumb found in the history page for ' + str(stock))

    # The crumb is lifted out of embedded JSON, where "/" may be written as
    # the escape \u002F; un-escape it (a no-op when the escape is absent).
    return (header, crumb[0].replace('\\u002F', '/'), website.cookies)

In [None]:
def convert_to_unix(date):
    """
    turn a date string into a unix timestamp

    parameters: date - date string in format (dd-mm-yyyy)

    returns the timestamp as an integer; mktime treats the date as local time
    """
    parsed = datetime.strptime(date, '%d-%m-%Y')
    seconds = mktime(parsed.timetuple())
    return int(seconds)

In [None]:
def load_csv_data(stock, interval='1d', day_begin='01-03-2018', day_end='28-03-2018'):
    """
    queries yahoo finance api to receive historical data in csv file format

    parameters:
        stock - short-handle identifier of the company

        interval - 1d, 1wk, 1mo - daily, weekly monthly data

        day_begin - starting date for the historical data (format: dd-mm-yyyy)

        day_end - final date of the data (format: dd-mm-yyyy)

    returns a list of comma separated value lines

    raises:
        requests.HTTPError - the server answered with an error status

        requests.Timeout - the server did not respond within the timeout
    """
    day_begin_unix = convert_to_unix(day_begin)
    day_end_unix = convert_to_unix(day_end)

    header, crumb, cookies = _get_crumbs_and_cookies(stock)

    url = 'https://query1.finance.yahoo.com/v7/finance/download/{stock}'.format(stock=stock)
    # Passing the query as `params` lets requests URL-encode each value, so a
    # crumb containing reserved characters cannot corrupt the query string.
    query = {'period1': day_begin_unix,
             'period2': day_end_unix,
             'interval': interval,
             'events': 'history',
             'crumb': crumb}

    # Issue the request through the session so the `with` block actually
    # manages (and closes) the connection. The 30 second timeout is an
    # arbitrary bound that prevents an indefinite hang.
    with requests.session() as session:
        website = session.get(url, headers=header, cookies=cookies, params=query, timeout=30)

    # Raise on an error response instead of returning its body as "csv" lines.
    website.raise_for_status()

    # splitlines() gives no trailing empty entry, and unlike split('\n')[:-1]
    # it keeps the final row when the body does not end with a newline.
    return website.text.splitlines()

In [None]:
# Fetch BTC-USD history using load_csv_data's default interval and date range;
# as the cell's last expression, the returned list of CSV lines is displayed.
quote = "BTC-USD"
load_csv_data(quote)

['Date,Open,High,Low,Close,Adj Close,Volume',
 '2018-03-01,10385.000000,11052.299805,10352.700195,10951.000000,10951.000000,7317279744',
 '2018-03-02,10977.400391,11189.000000,10850.099609,11086.400391,11086.400391,7620590080',
 '2018-03-03,11101.900391,11528.200195,11002.400391,11489.700195,11489.700195,6690570240',
 '2018-03-04,11497.400391,11512.599609,11136.099609,11512.599609,11512.599609,6084149760',
 '2018-03-05,11532.400391,11704.099609,11443.900391,11573.299805,11573.299805,6468539904',
 '2018-03-06,11500.099609,11500.099609,10694.299805,10779.900391,10779.900391,6832169984',
 '2018-03-07,10803.900391,10929.500000,9692.120117,9965.570313,9965.570313,8797910016',
 '2018-03-08,9951.440430,10147.400391,9335.870117,9395.009766,9395.009766,7186089984',
 '2018-03-09,9414.690430,9466.349609,8513.030273,9337.549805,9337.549805,8704190464',
 '2018-03-10,9350.589844,9531.320313,8828.469727,8866.000000,8866.000000,5386319872',
 '2018-03-11,8852.780273,9711.889648,8607.120117,9578.629883,