# Dependencies

In [37]:
!pip install monthdelta

[33mYou are using pip version 10.0.1, however version 19.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [38]:
# Import Package
import bs4 as bs
import urllib.request
import pandas as pd
import time
import datetime
import monthdelta

In [39]:
# Chrome Driver
# executable_path = {'executable_path':'/usr/local/bin/chromedriver'}
# browser = Browser('chrome',**executable_path,headless=False)

# Web Scraping Yahoo

## Web Scraping Function
* Input: 
    * a) Ticker Name, 
    * b) Start date, in datetime format; 
    * c) End date, in datetime format; 
    * d) Optional, data frequency:
        * d.1) 1d (every business day); 
        * d.2) 1wk (every week); 
        * d.3) 1mo (every month)
* Output: Dataframe with the following column names:
    * 1) Open Price
    * 2) High Price
    * 3) Low Price
    * 4) Close Price
    * 5) Adj Close Price
    * 6) Volume

In [44]:
def get_historical_price(ticker, date1, date2, frequency='1d',display=True):
 
    format_string='%Y-%m-%d %H:%M:%S'
 
    # One day (86400 second) adjustment required to get dates printed to match web site manual output
    _date1 = date1.strftime("%Y-%m-%d 00:00:00")
    date1_epoch = str(int(time.mktime(time.strptime(_date1, format_string)))- 86400)
    
    if display == True: 
        print("")
        print(date1, date1_epoch, " + 86,400 = ", str(int(date1_epoch) + 86400))
 
    _date2 = date2.strftime("%Y-%m-%d 00:00:00")
    date2_epoch = str(int(time.mktime(time.strptime(_date2, format_string))))
    
    if display == True:
        print(date2, date2_epoch)
 
    url = 'https://finance.yahoo.com/quote/' + ticker + '/history?period1=' + date1_epoch + '&period2=' + date2_epoch + '&interval='+frequency+'&filter=history&frequency='+frequency
    source = urllib.request.urlopen(url).read()      
    soup = bs.BeautifulSoup(source,'lxml')
    table_rows = soup.find_all('tr')
      
    extract_table = []
      
    for table_row in table_rows:
        table_row_values = table_row.find_all('td')
        extract_row = [i.text for i in table_row_values]
        extract_table.append(extract_row)        
      
    column_names = ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
  
    # data = data[1:-2]
    extract_table_df = pd.DataFrame(extract_table)
    extract_table_df.columns = column_names
    extract_table_df.set_index(column_names[0], inplace=True)
    extract_table_df = extract_table_df.convert_objects(convert_numeric=True)
    extract_table_df = extract_table_df.iloc[::-1]
    extract_table_df.dropna(inplace=True)
      
    return extract_table_df

## Web Scraping Function Trial with Apple Stock

In [41]:
## Try Scraping with APPL. 

# Initialize the end date to be today and start date is one year before. 
print("")
print("")
start_date = datetime.date(2018, 6, 2)
end_date = datetime.date(2019, 2, 19)
today = datetime.date.today()
 
# The statements in this group are for debugging purposes only
format_string='%Y-%m-%d %H:%M:%S'
t1 = start_date.strftime("%Y-%m-%d 00:00:00")
t2 = end_date.strftime("%Y-%m-%d 00:00:00")
start_date_epoch = str(int(time.mktime(time.strptime(t1, format_string))))
end_date_epoch = str(int(time.mktime(time.strptime(t2,format_string))))
 
# Output all 'original' dates
print('Today     :', today)
print('Start Date:', start_date, 'Start Date Epoch:', start_date_epoch)
print('End   Date:', end_date,   'End   Date Epoch:', end_date_epoch)
 
# Initialize 'date1'
date1 = start_date
 
# Initialize 'date1'
date1 = start_date
 
# Do not allow the 'End Date' to be AFTER today
if today < end_date:
    end_date = today

iteration_number = 0
while date1 <= end_date:
    iteration_number += 1
 
    # Create 'date2' in a 60 day Window or less
    date2 = date1 + monthdelta.monthdelta(2)
    date2 = datetime.date(date2.year, date2.month, 1)
    date2 = date2 - datetime.timedelta(days=1)
         
    # Do not allow 'date2' to go beyond the 'End Date'
    if date2 > end_date:
        date2 = end_date
         
    print(f"Processing {date1} thru {date2}.")
    stock_symbol = 'AAPL'
    df = get_historical_price(stock_symbol, date1, date2)
     
    if iteration_number == 1:
        dfall = df.copy()
    else:
        frames = [dfall, df]
        dfall = pd.concat(frames)
 
    # # # print(dfall)
    # # # print("len of dfall = {}".format(len(dfall)))
 
    # Increment the first date for the next pass
    date1 = date1   + monthdelta.monthdelta(2)
    date1 = datetime.date(date1.year, date1.month, 1)

# Output concatenated data set
print(dfall)
print(f"len of whole extracted data set = {len(dfall)}")



Today     : 2019-02-19
Start Date: 2018-06-02 Start Date Epoch: 1527915600
End   Date: 2019-02-19 End   Date Epoch: 1550556000
Processing 2018-06-02 thru 2018-07-31.

2018-06-02 1527829200  + 86,400 =  1527915600
2018-07-31 1533013200


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.


Processing 2018-08-01 thru 2018-09-30.

2018-08-01 1533013200  + 86,400 =  1533099600
2018-09-30 1538283600
Processing 2018-10-01 thru 2018-11-30.

2018-10-01 1538283600  + 86,400 =  1538370000
2018-11-30 1543557600
Processing 2018-12-01 thru 2019-01-31.

2018-12-01 1543557600  + 86,400 =  1543644000
2019-01-31 1548914400
Processing 2019-02-01 thru 2019-02-19.

2019-02-01 1548914400  + 86,400 =  1549000800
2019-02-19 1550556000
                Open    High     Low   Close  Adj Close      Volume
Date                                                               
Jun 01, 2018  187.99  190.26  187.75  190.24     188.11  23,442,500
Jun 04, 2018  191.64  193.42  191.35  191.83     189.68  26,266,200
Jun 05, 2018  193.07  193.94  192.36  193.31     191.14  21,566,000
Jun 06, 2018  193.63  194.08  191.92  193.98     191.81  20,933,600
Jun 07, 2018  194.14  194.20  192.34  193.46     191.29  21,347,200
Jun 08, 2018  191.17  192.00  189.77  191.70     189.55  26,656,800
Jun 11, 2018  191.35  19

## Web Scraping with Short and Long Term Bond ETFs

### ETF Tickers
* Long Term Bond ETFs:
    * IEF --> iShares Barclays 7-10 Year Trasry Bnd Fd
    * DTYL --> BARCLAY BK IPAT US TR 10 YR BULL ETN
    * EDV --> VANGUARD WORLD/EXTD DURATION TREAS
    * TLH --> iShares 10-20 Year Treasury Bond ETF
* Long Term Bond ETFs:
    * SHV --> iShares Short Treasury Bond ETF 
    * VGSH --> Vanguard Short-Term Treasury ETF
    * SCHR --> Schwab Intermediate-Term US Trs ETF

In [42]:
Short_TR_Tickers = ['IEF','DTYL','EDV','TLH']
Long_TR_Tickers = ['SHV','VGSH','SCHR']

### Loop through each ticker to scrape data, Scrape from 2000 to today.  

In [45]:
# Scrape Short Term Bonds
Short_TR_Data_Dict = {}

for ticker in Short_TR_Tickers:
    
    # Set up start and end date
    start_date = datetime.date(2000, 1, 2)
    end_date = datetime.date.today()
    
    print("")
    print("")
    print ('Scraping historical data from ',start_date,' to ',end_date)
    print (f'Scraping ticker is {ticker}')
   
    ticker_df = get_historical_price(ticker, start_date, end_date, '1d', False)
    
    # Drop the data that has 
    ticker_df.dropna(inplace=True)
    
    print (f'Number of data extracted = {len(ticker_df)}')
    
    Short_TR_Data_Dict = {**Short_TR_Data_Dict,**{ticker:ticker_df}}



Scraping historical data from  2000-01-02  to  2019-02-19
Scraping ticker is IEF


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.


Number of data extracted = 96


Scraping historical data from  2000-01-02  to  2019-02-19
Scraping ticker is DTYL
Number of data extracted = 9


Scraping historical data from  2000-01-02  to  2019-02-19
Scraping ticker is EDV
Number of data extracted = 99


Scraping historical data from  2000-01-02  to  2019-02-19
Scraping ticker is TLH
Number of data extracted = 96


In [46]:
# Scrape Long Term Bonds
Long_TR_Data_Dict = {}

for ticker in Long_TR_Tickers:
    
    # Set up start and end date
    start_date = datetime.date(2000, 1, 2)
    end_date = datetime.date.today()
    
    print("")
    print("")
    print ('Scraping historical data from ',start_date,' to ',end_date)
    print (f'Scraping ticker is {ticker}')
   
    ticker_df = get_historical_price(ticker, start_date, end_date, '1d',False)
    
    # Drop the data that has 
    ticker_df.dropna(inplace=True)
    
    print (f'Number of data extracted = {len(ticker_df)}')
    
    Long_TR_Data_Dict = {**Long_TR_Data_Dict,**{ticker:ticker_df}}



Scraping historical data from  2000-01-02  to  2019-02-19
Scraping ticker is SHV


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.


Number of data extracted = 96


Scraping historical data from  2000-01-02  to  2019-02-19
Scraping ticker is VGSH
Number of data extracted = 96


Scraping historical data from  2000-01-02  to  2019-02-19
Scraping ticker is SCHR
Number of data extracted = 96
