In [3]:
import pandas as pd
import numpy as np

from pandas_datareader import data as pdr
from selenium import webdriver
from cryptocmd import CmcScraper

import requests
from bs4 import BeautifulSoup

from time import sleep
from random import randint
import time

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import holidays

%config InlineBackend.figure_format = 'retina'

In [4]:
# Headless, incognito Chrome session used for scraping. `scrape_nasdaq` reads
# the module-level `driver` created here, so this cell has to succeed before
# that function is called with any tickers.
# NOTE: in the recorded run this cell raised WebDriverException because the
# `chromedriver` executable was not on PATH, which leaves `driver` undefined.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
# No visible browser window.
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


In [5]:
def stock_returns(ticker_list, start, end):
    """Download price history for each ticker through ``pdr.get_data_yahoo``.

    Parameters
    ----------
    ticker_list : iterable
        Ticker symbols to request, one call per symbol.
    start, end
        Date bounds, passed to ``pdr.get_data_yahoo`` unchanged.

    Returns
    -------
    (list_of_df, not_found) : tuple(list, list)
        ``list_of_df`` holds one DataFrame per successfully downloaded ticker.
        Each frame has its index moved into a regular column by
        ``reset_index`` and a ``ticker`` column added. ``not_found`` holds the
        tickers whose download raised ``ValueError``.

    Notes
    -----
    * The column produced by ``reset_index`` keeps its original name. The rest
      of the notebook selects it as ``'Date'``, so it is deliberately NOT
      renamed here. The earlier ``df.rename(...)`` call discarded its result
      and therefore never renamed anything; it has been removed.
    * Only ``ValueError`` is treated as "not found"; any other exception
      propagates to the caller.
    """
    list_of_df = []
    not_found = []

    for ticker in ticker_list:
        try:
            df = pdr.get_data_yahoo(ticker, start, end).reset_index()
        except ValueError:
            print("Not Found: " + str(ticker))
            not_found.append(ticker)
            continue

        # A scalar is broadcast to every row (and is fine for an empty frame).
        df['ticker'] = ticker
        list_of_df.append(df)

    return list_of_df, not_found

In [6]:
def _parse_history_rows(page_source):
    """Extract the price rows from one rendered 'historical' page.

    Every ``<tr>`` inside a ``<div id="historicalContainer">`` is turned into
    a list of its stripped ``<td>`` texts. The first two collected rows are
    dropped, as the original implementation did (presumably header or
    placeholder rows; verify against the page markup). Rows with fewer than
    the six cells the caller reads are skipped, so that they cannot raise
    ``IndexError`` later.
    """
    soup = BeautifulSoup(page_source, 'lxml')

    rows = []
    for container in soup.find_all('div', id="historicalContainer"):
        for table_row in container.find_all('tr'):
            rows.append([cell.text.strip() for cell in table_row.find_all('td')])

    return [row for row in rows[2:] if len(row) >= 6]


def scrape_nasdaq(ticker_list):
    """Scrape the '10 Years' price table for each ticker from old.nasdaq.com.

    Uses the notebook-level Selenium ``driver``; it must exist before this
    function is called with a non-empty list.

    Parameters
    ----------
    ticker_list : iterable of str
        Symbols to scrape. A symbol whose page has no ``ddlTimeFrame``
        drop-down is reported as not found and skipped. If a symbol appears
        more than once, the rows from its first successful scrape are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, Open, High, Low, Close, Volume, ticker``, one row per
        scraped table row. All values are the page's strings: ``/`` in the
        date is replaced by ``-`` and thousands separators are removed from
        ``Volume``, with no numeric or date conversion. The frame is empty
        (with the same columns) when nothing was scraped.
    """
    columns = ['date', 'Open', 'High', 'Low', 'Close', 'Volume', 'ticker']
    rows_by_ticker = {}
    count = 1

    for ticker in ticker_list:
        print("Scraping count: " + str(count))

        driver.get('https://old.nasdaq.com/symbol/' + ticker + '/historical')

        # `find_elements(by, value)` exists in both Selenium 3 and 4, whereas
        # the `find_elements_by_*` helpers were removed in Selenium 4.3.
        # 'name' and 'tag name' are the values of By.NAME and By.TAG_NAME.
        dropdowns = driver.find_elements('name', 'ddlTimeFrame')
        if not dropdowns:
            print("Not Found: " + ticker)
            continue

        # Choose 10 year data from the drop down
        for option in dropdowns[0].find_elements('tag name', 'option'):
            if option.text == '10 Years':
                option.click()
                break
        # Fixed wait before reading the page source, so the table can refresh.
        time.sleep(5)

        rows_by_ticker.setdefault(ticker, _parse_history_rows(driver.page_source))

        # Time pause to prevent blocking
        print("Random Sleep")
        sleep(randint(2, 4))
        count += 1

    # Flatten {ticker: rows} into one record per table row.
    records = []
    for symbol, rows in rows_by_ticker.items():
        for row in rows:
            records.append((
                row[0].replace('/', '-'),
                row[1],
                row[2],
                row[3],
                row[4],
                row[5].replace(',', ''),
                symbol,
            ))

    return pd.DataFrame(records, columns=columns)

In [7]:
def get_stocks_data(ticker_set, start, end):
    """Collect price history for ``ticker_set``, falling back to Nasdaq scraping.

    Every ticker is first requested through ``stock_returns``. Tickers it
    reports as not found are then scraped with ``scrape_nasdaq``.

    Parameters
    ----------
    ticker_set : iterable of str
        Ticker symbols to fetch.
    start, end
        Date bounds forwarded to ``stock_returns``. The Nasdaq fallback does
        not receive them.

    Returns
    -------
    pandas.DataFrame
        The column layout depends on which path was taken:

        * No ticker needed the fallback: the plain concatenation of the
          ``stock_returns`` frames, with every downloaded column and the date
          column still named ``Date``.
        * At least one ticker needed the fallback: columns
          ``date, Open, High, Low, Close, Volume, ticker``. Yahoo rows carry
          ``datetime.date`` values in ``date``; Nasdaq rows keep the date
          string produced by ``scrape_nasdaq``. Nasdaq prices and volume are
          cast to float.

        Callers that select ``'Date'`` therefore only work on the first path.

    Notes
    -----
    When every ticker falls back to Nasdaq there are no Yahoo frames to
    concatenate. The scraped frame is then returned on its own, where the
    earlier code raised ``ValueError`` from ``pd.concat`` on an empty list.
    """
    price_cols = ['Open', 'High', 'Low', 'Close', 'Volume']

    # Stocks data from Yahoo
    yahoo_frames, not_found = stock_returns(ticker_set, start, end)

    if not not_found:
        return pd.concat(yahoo_frames)

    # Stocks data for the remaining tickers from Nasdaq
    nasdaq = scrape_nasdaq(not_found).rename(columns={'Ticker': 'ticker'})
    nasdaq = nasdaq.astype({col: float for col in price_cols})

    if not yahoo_frames:
        return nasdaq

    # Bring the Yahoo frames to the Nasdaq column layout before merging.
    # `.assign` is used so the date conversion never writes into a slice.
    yahoo = (
        pd.concat(yahoo_frames)
        .rename(columns={'Date': 'date'})
        .loc[:, ['date'] + price_cols + ['ticker']]
        .assign(date=lambda frame: frame['date'].dt.date)
    )

    return pd.concat([yahoo, nasdaq])

In [8]:
# Symbols to download and the date window requested for them.
tickers = ['AAPL', 'XOM' ,'VMC', 'BA', 'AMZN', 'TGT', 'WMT', 'KO', 'UNH', 'JPM', 'GOOGL', 'STT', 'MSFT', 'VZ', 'XEL', 'SPG']
# Performs the actual downloads (and, for tickers not found on Yahoo, the
# Selenium scrape), so this cell needs network access.
df = get_stocks_data(tickers, start='2005-01-01', end='2019-08-01')

In [9]:
# Keep only the date, OHLC and ticker columns (Volume and any other columns are dropped).
# NOTE: this selects 'Date', which get_stocks_data returns only when no ticker
# needed the Nasdaq fallback; on the fallback path the column is named 'date'
# and this line raises KeyError.
df = df[['Date', 'Open', 'High', 'Low', 'Close', 'ticker']]

In [10]:
# Spot-check five randomly chosen rows (no seed is set, so the rows differ between runs).
df.sample(5)

Unnamed: 0,Date,Open,High,Low,Close,ticker
1373,2010-06-16,66.93,67.559998,66.599998,67.029999,BA
672,2007-09-04,19.991428,20.818571,19.977142,20.594286,AAPL
1009,2009-01-05,20.200001,20.67,20.059999,20.52,MSFT
2210,2013-10-11,433.448456,437.177185,433.083069,436.431427,GOOGL
2579,2015-04-01,48.450001,49.27,48.43,48.919998,VZ


In [11]:
# Write the combined frame to 'stocks.csv' in the current working directory.
# The DataFrame index is written too (as the first, unnamed column), because
# `index=False` is not passed.
df.to_csv('stocks.csv')