# Data Sourcing
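In this notebook, we download the raw data from Yahoo Finance: the 10-Year Treasury Yield and, for a list of stock tickers, the historical closing prices, the dividends, the option chains, and the historical option prices.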

In [1]:
import warnings
# Ignore every Python warning from here on (presumably to keep the cell outputs clean).
# NOTE(review): this also hides deprecation notices from the data libraries —
# consider narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')

In [None]:
!pip install yfinance
!pip install tqdm

In [8]:
from pathlib import Path

import pandas as pd
import yfinance as yf
from tqdm import tqdm

## Set the Start and End Dates for the Data Sourcing

In [5]:
# Date window for the historical downloads, written as 'YYYY-MM-DD' strings.
# Both values are passed as the start/end arguments of the yfinance calls
# that fetch the treasury yield and the stock prices.
start_date = '2020-01-01'
end_date = '2023-04-15'

## Download the historical data for the 10-Year Treasury Yield

In [6]:
# Fetch the history of the 10-Year Treasury Yield for the configured date window
ticker = '^TNX'
data = yf.download(ticker, start=start_date, end=end_date)

# Keep only the closing values and wrap them in a one-column frame labelled 'rate'
close_prices = data['Close']
rate_df = pd.DataFrame(close_prices).set_axis(['rate'], axis=1)

[*********************100%***********************]  1 of 1 completed


In [7]:
# Preview the first rows of the treasury yield data
rate_df.head()

Unnamed: 0_level_0,rate
Date,Unnamed: 1_level_1
2020-01-02,1.882
2020-01-03,1.788
2020-01-06,1.811
2020-01-07,1.827
2020-01-08,1.874


## Define the list of stock tickers

In [8]:
# Universe of symbols to download. Entries carry exchange suffixes where needed
# ('.PA', '.L', '.TO', '.DE', '.AS', '.F') and use a hyphen for share classes
# (e.g. 'BRK-A'); 'BRK-B' follows that same form instead of the dotted 'BRK.B'.
# NOTE(review): several entries look renamed or delisted (e.g. 'FB', 'UTX', 'TOT',
# 'RDS-A', 'AGN', 'ETFC'), and 'IXIC' lacks the '^' prefix this notebook uses for
# '^TNX' — confirm whether they should be updated. The download loops only report
# symbols that fail and carry on.
tickers = ['AAPL', 'MSFT', 'GOOGL', 'GOOG', 'AMZN', 'JNJ', 'TSLA',
'JPM', 'NVDA', 'V', 'BAC', 'PG', 'HD', 'MA', 'UNH', 'DIS', 'PYPL',
'XOM', 'VZ', 'INTC', 'KO', 'WMT', 'CVX', 'ADBE', 'PFE', 'MRK', 'CRM',
'ORCL', 'CSCO', 'NFLX', 'BMY', 'T', 'BA', 'PEP', 'ABBV', 'COST', 'MDT',
'ABT', 'NKE', 'PM', 'HON', 'LMT', 'UPS', 'IBM', 'FDX', 'MMM', 'META',
'AAP', 'NIO', 'RKT', 'ABB', 'ABN.AS', 'ADP', 'ADS', 'AIG', 'ALGN', 'ALL',
'AMGN', 'AON', 'APA', 'APD', 'APH', 'ATVI', 'AVGO', 'AXP', 'AZN',
'BAES.L', 'BAX', 'BDX', 'BHP', 'BIIB', 'BLK', 'BNP.PA', 'BRK-A', 'BSX',
'BTI', 'BUD', 'CAT', 'CCI', 'CL', 'CLX', 'CME', 'COF', 'COP', 'CPNG',
'CRH', 'CS.PA', 'CSGP', 'CSX', 'CTSH', 'DAL', 'DHR', 'DIA', 'DOW', 'DUK',
'DVA', 'DXCM', 'EA', 'EBAY', 'EC', 'EMR', 'ENB', 'EOG', 'EQNR', 'EQIX',
'ETN', 'ETR', 'EW', 'EXC', 'EXPD', 'F', 'FB', 'FIS', 'FISV', 'FNV', 'GD',
'GE', 'GIS', 'GLD', 'GLW', 'GM', 'GNRC', 'GOLD', 'GOOS.TO', 'GPC', 'GRMN',
'GS', 'GSK', 'HAL', 'HEI.DE', 'HEI.F', 'HELE', 'HFC', 'HPQ', 'HSY', 'HUM',
'ICE', 'IDXX', 'IEF', 'INTU', 'ISRG', 'ITW', 'IVV', 'IXIC', 'JCI', 'KHC',
'KMI', 'KR', 'KSS', 'L', 'LBRDK', 'LEG', 'LH', 'LIN', 'LULU', 'LUV',
'LVS', 'LYB', 'MCD', 'MDLZ', 'MKC', 'MO', 'MS', 'MSCI', 'MSI', 'MTD',
'MU', 'NEE', 'NG.L', 'NOC', 'OBLN', 'OKE', 'OMC', 'ON', 'ORAN.PA',
'ORLY', 'OTIS', 'OXY', 'PAYC', 'PAYX', 'PBR', 'PCAR', 'PGR', 'PH', 'PPG',
'PPL', 'PRU', 'PSA', 'PSX', 'PXD', 'QCOM', 'QRVO', 'RACE', 'RDS-A',
'REGN', 'RIO', 'RJF', 'RL', 'ROK', 'ROST', 'RTX', 'RY.TO', 'SAP', 'SBAC',
'SBUX', 'SCHW', 'SFTBY', 'SHW', 'SLB', 'SNA', 'SNAP', 'SO', 'SPG', 'SPY',
'SRE', 'SRY.TO', 'STT', 'STZ', 'SU.TO', 'SWK', 'SWKS', 'SYK', 'SYNH',
'TAP', 'TD.TO', 'TEAM', 'TFC', 'TGT', 'TMO', 'TMUS', 'TOT', 'TPR',
'TRP.TO', 'TRV', 'TSCO', 'TSN', 'TT', 'TXN', 'UNP', 'USB', 'UTX', 'VALE',
'VFC', 'VLO', 'VMC', 'VOD', 'VRSK', 'VRSN', 'VRTX', 'VTR', 'WBA', 'WDC',
'WEC', 'WFC', 'WHR', 'WMU.DE', 'WPM', 'WSM', 'WY', 'XEL', 'XLE', 'XLF',
'XLI', 'XLK', 'XLP', 'XLRE', 'XLU', 'XLV', 'XRT', 'XYL', 'YUM', 'ZM',
'ZTS', 'ECL', 'SNOW', 'ACA.PA', 'ACN', 'ADI', 'AEP', 'AES', 'AFL', 'AGN',
'AJG', 'ALXN', 'AMAT', 'AMD', 'AMP', 'AMT', 'ANTM', 'APTV', 'ARE',
'ASML', 'ATO', 'AVB', 'AVY', 'AWK', 'BABA', 'BK', 'BKNG', 'BLL', 'BR',
'BRK-B', 'C', 'CAG', 'CAH', 'CB', 'CBOE', 'CCL', 'CDNS', 'CDW', 'CE',
'CERN', 'CF', 'CFG', 'CHD', 'CHRW', 'CHTR', 'CI', 'CINF', 'CMA', 'CMCSA',
'CMG', 'CMI', 'CMS', 'CNC', 'CNP', 'COG', 'COO', 'COTY', 'CPB', 'CPRI',
'CPRT', 'CTAS', 'CTLT', 'CTXS', 'CVS', 'D', 'DD', 'DE', 'DFS', 'DG',
'DGX', 'DHI', 'DISCA', 'DISCK', 'DLR', 'DLTR', 'DOV', 'DRE', 'DRI',
'DTE', 'DVN', 'DXC', 'ED', 'EFX', 'EIX', 'EL', 'EMN', 'EQR', 'ES',
'ETFC', 'EVRG', 'EXPE', 'FANG', 'FAST', 'FBHS', 'FCX', 'FE', 'FFIV',
'FITB', 'FLIR', 'FLS', 'FLT', 'FMC', 'FOX', 'FOXA', 'FRC', 'FRT', 'FTI',
'FTNT', 'FTV', 'GGP', 'GILD', 'GL', 'GNL', 'GNTX', 'GPN', 'GPS', 'GT',
'GWW', 'HAS', 'HBAN', 'HBI', 'HCA', 'HES', 'HIG', 'HII', 'HLT', 'HOG',
'HRL', 'HSBC']

In [9]:
# Show how many symbols are in the download list
len(tickers)

401

## Download the historical data for the stock tickers

In [None]:
# Download the closing prices of every ticker into one long-format frame with
# the columns 'Close' and 'Ticker', indexed by date.
stock_frames = []
for ticker in tqdm(tickers, desc='Fetching stock data'):
    try:
        # Get the historical data for the ticker within the configured date window
        history = yf.Ticker(ticker).history(start=start_date, end=end_date)

        # A symbol without any rows would only contribute an empty frame: report and skip it
        if history.empty:
            print(f"Error with {ticker}: no price data returned")
            continue

        # Keep the 'Close' column and tag every row with its ticker
        stock_frames.append(history[['Close']].assign(Ticker=ticker))
    except Exception as exc:
        # Best effort: one bad symbol must not abort the whole download.
        # Catching Exception (not a bare except) keeps a kernel interrupt working
        # and the message now shows why the symbol failed.
        print(f"Error with {ticker}: {exc!r}")

# Concatenate once at the end instead of re-copying the growing frame on every iteration
stocks_df = pd.concat(stock_frames, axis=0) if stock_frames else pd.DataFrame()

In [11]:
# Preview the first rows of the combined stock prices
stocks_df.head()

Unnamed: 0_level_0,Close,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02 00:00:00-05:00,73.449409,AAPL
2020-01-03 00:00:00-05:00,72.735321,AAPL
2020-01-06 00:00:00-05:00,73.31488,AAPL
2020-01-07 00:00:00-05:00,72.970093,AAPL
2020-01-08 00:00:00-05:00,74.143913,AAPL


In [12]:
# Display the combined stock prices (the rendered table is truncated for long frames)
stocks_df

Unnamed: 0_level_0,Close,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02 00:00:00-05:00,73.449409,AAPL
2020-01-03 00:00:00-05:00,72.735321,AAPL
2020-01-06 00:00:00-05:00,73.314880,AAPL
2020-01-07 00:00:00-05:00,72.970093,AAPL
2020-01-08 00:00:00-05:00,74.143913,AAPL
...,...,...
2023-04-10 00:00:00-04:00,35.130001,HSBC
2023-04-11 00:00:00-04:00,34.990002,HSBC
2023-04-12 00:00:00-04:00,35.090000,HSBC
2023-04-13 00:00:00-04:00,35.669998,HSBC


## Download the historical dividend payments for the stock tickers

In [None]:
# Download the dividend history of every ticker into one long-format frame with
# the columns 'Dividends' and 'Ticker', indexed by date.
# NOTE(review): unlike the price download, no start/end window is applied here,
# so the full available dividend history is fetched — confirm this is intended.
dividend_frames = []
for ticker in tqdm(tickers, desc='Fetching dividend data'):
    try:
        # Get the dividends data for the ticker
        dividends = pd.DataFrame(yf.Ticker(ticker).dividends)

        # A ticker without dividend rows adds nothing to the result, so skip it
        if dividends.empty:
            continue

        # Tag every row with its ticker
        dividends['Ticker'] = ticker
        dividend_frames.append(dividends)
    except Exception as exc:
        # Best effort: report the failing symbol (with the cause) and carry on.
        # Catching Exception (not a bare except) keeps a kernel interrupt working.
        print(f"Error with {ticker}: {exc!r}")

# Concatenate once at the end instead of re-copying the growing frame on every iteration
dividends_df = pd.concat(dividend_frames, axis=0) if dividend_frames else pd.DataFrame()

In [14]:
# Display the combined dividends (the rendered table is truncated for long frames)
dividends_df

Unnamed: 0,Dividends,Ticker
1987-05-11 00:00:00-04:00,0.000536,AAPL
1987-08-10 00:00:00-04:00,0.000536,AAPL
1987-11-17 00:00:00-05:00,0.000714,AAPL
1988-02-12 00:00:00-05:00,0.000714,AAPL
1988-05-16 00:00:00-04:00,0.000714,AAPL
...,...,...
2021-03-11 00:00:00-05:00,0.750000,HSBC
2021-08-19 00:00:00-04:00,0.350000,HSBC
2022-03-10 00:00:00-05:00,0.900000,HSBC
2022-08-18 00:00:00-04:00,0.450000,HSBC


## Get the Option Chains for the stock tickers

In [None]:
# Download the option chain of every ticker into one frame, calls followed by puts.
# option_chain() is called without an expiration date.
# NOTE(review): presumably this returns only the nearest expiration per ticker —
# confirm against the yfinance documentation if more expirations are needed.
option_frames = []
for ticker in tqdm(tickers, desc='Fetching option chains'):
    try:
        # Get the option chain for the ticker
        option_chain = yf.Ticker(ticker).option_chain()

        # Stack the call and put contracts of this ticker
        option_frames.append(pd.concat([option_chain.calls, option_chain.puts]))
    except Exception as exc:
        # Best effort: symbols without listed options are reported and skipped.
        # Catching Exception (not a bare except) keeps a kernel interrupt working.
        print(f"Error with {ticker}: {exc!r}")

# Concatenate once at the end instead of re-copying the growing frame on every iteration
options_df = pd.concat(option_frames) if option_frames else pd.DataFrame()

In [14]:
# Preview the first rows of the combined option chains
options_df.head()

Unnamed: 0,contractSymbol,lastTradeDate,strike,lastPrice,bid,ask,change,percentChange,volume,openInterest,impliedVolatility,inTheMoney,contractSize,currency
0,AAPL230421C00050000,2023-04-10 15:10:29+00:00,50.0,111.25,114.55,116.3,0.0,0.0,4.0,1173.0,4.187505,True,REGULAR,USD
1,AAPL230421C00060000,2023-04-05 13:40:06+00:00,60.0,104.95,104.65,106.2,0.0,0.0,10.0,187.0,3.582032,True,REGULAR,USD
2,AAPL230421C00065000,2023-04-13 17:41:13+00:00,65.0,99.83,99.75,101.1,0.0,0.0,1.0,114.0,3.316408,True,REGULAR,USD
3,AAPL230421C00070000,2023-04-05 13:45:39+00:00,70.0,94.7,94.8,96.2,0.0,0.0,2.0,217.0,3.207033,True,REGULAR,USD
4,AAPL230421C00075000,2023-04-05 13:32:00+00:00,75.0,89.65,89.2,91.0,0.0,0.0,1.0,118.0,3.513673,True,REGULAR,USD


In [15]:
# Display the combined option chains (the rendered table is truncated for long frames)
options_df

Unnamed: 0,contractSymbol,lastTradeDate,strike,lastPrice,bid,ask,change,percentChange,volume,openInterest,impliedVolatility,inTheMoney,contractSize,currency
0,AAPL230421C00050000,2023-04-10 15:10:29+00:00,50.0,111.25,114.55,116.30,0.0,0.000000,4.0,1173.0,4.187505,True,REGULAR,USD
1,AAPL230421C00060000,2023-04-05 13:40:06+00:00,60.0,104.95,104.65,106.20,0.0,0.000000,10.0,187.0,3.582032,True,REGULAR,USD
2,AAPL230421C00065000,2023-04-13 17:41:13+00:00,65.0,99.83,99.75,101.10,0.0,0.000000,1.0,114.0,3.316408,True,REGULAR,USD
3,AAPL230421C00070000,2023-04-05 13:45:39+00:00,70.0,94.70,94.80,96.20,0.0,0.000000,2.0,217.0,3.207033,True,REGULAR,USD
4,AAPL230421C00075000,2023-04-05 13:32:00+00:00,75.0,89.65,89.20,91.00,0.0,0.000000,1.0,118.0,3.513673,True,REGULAR,USD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,HSBC230421P00037000,2023-04-13 13:41:34+00:00,37.0,0.85,0.75,0.85,-0.8,-48.484848,41.0,251.0,0.236336,True,REGULAR,USD
14,HSBC230421P00038000,2023-03-28 19:30:28+00:00,38.0,4.61,1.35,2.75,0.0,0.000000,1.0,50.0,0.531255,True,REGULAR,USD
15,HSBC230421P00039000,2023-03-27 18:24:00+00:00,39.0,5.41,0.40,5.00,0.0,0.000000,4.0,0.0,1.827149,True,REGULAR,USD
16,HSBC230421P00040000,2023-04-11 18:34:36+00:00,40.0,5.00,2.00,6.00,0.0,0.000000,6.0,0.0,0.777346,True,REGULAR,USD


## Get Historical Option Prices Data

In [None]:
# Download the full available price history of every option contract listed in
# options_df into one long-format frame with the columns 'Close' and 'Ticker'
# (the 'Ticker' column holds the contract symbol), indexed by date.
options_list = options_df['contractSymbol'].tolist()
option_price_frames = []
for option in tqdm(options_list, desc='Fetching option data'):
    try:
        # Get the historical price data for the option contract
        history = yf.Ticker(option).history(period="max")

        # A contract without any price rows adds nothing to the result, so skip it
        if history.empty:
            continue

        # Keep the 'Close' column and tag every row with its contract symbol
        option_price_frames.append(history[['Close']].assign(Ticker=option))
    except Exception as exc:
        # Best effort: report the failing contract (with the cause) and carry on.
        # Catching Exception (not a bare except) keeps a kernel interrupt working
        # during this long-running loop.
        print(f"Error with {option}: {exc!r}")

# Concatenate once at the end: re-copying the growing frame for every contract
# makes the loop quadratic in the number of rows.
options_prices_df = pd.concat(option_price_frames) if option_price_frames else pd.DataFrame()

In [21]:
# Display the combined option price history (the rendered table is truncated for long frames)
options_prices_df

Unnamed: 0_level_0,Close,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-08-25 00:00:00-04:00,119.500000,AAPL230421C00050000
2022-08-26 00:00:00-04:00,117.349998,AAPL230421C00050000
2022-08-29 00:00:00-04:00,112.599998,AAPL230421C00050000
2022-09-01 00:00:00-04:00,107.599998,AAPL230421C00050000
2022-09-02 00:00:00-04:00,106.250000,AAPL230421C00050000
...,...,...
2023-04-03 00:00:00-04:00,5.200000,HSBC230421P00040000
2023-04-11 00:00:00-04:00,5.000000,HSBC230421P00040000
2023-03-09 00:00:00-05:00,5.000000,HSBC230421P00042000
2023-03-13 00:00:00-04:00,7.200000,HSBC230421P00042000


In [22]:
# Earliest and latest timestamps present in the option price index
first_date, last_date = options_prices_df.index.min(), options_prices_df.index.max()
# Show both bounds as the cell output
(first_date, last_date)

(Timestamp('2021-10-04 00:00:00-0400', tz='America/New_York'),
 Timestamp('2023-04-14 00:00:00-0400', tz='America/New_York'))

## Export the data to CSV files as a backup

In [16]:
# Write every downloaded frame to its own CSV file as a backup of the raw data.
# to_csv does not create missing folders, so make sure the target directory
# exists first; exist_ok=True keeps re-runs of this cell from failing.
Path('01_Raw_Data').mkdir(parents=True, exist_ok=True)

rate_df.to_csv('01_Raw_Data/rate.csv')
stocks_df.to_csv('01_Raw_Data/stocks.csv')
dividends_df.to_csv('01_Raw_Data/dividends.csv')
options_df.to_csv('01_Raw_Data/options.csv')
options_prices_df.to_csv('01_Raw_Data/options_prices.csv')