# Collect Data

This notebook uses a list of tickers to create `./data/features.csv` and `./data/labels.csv` from data found on Yahoo Finance and Zacks.com.

## Installation

In [45]:
#%pip install --upgrade numpy
#%pip install --upgrade pandas
#%pip install --upgrade yfinance

In [1]:
import numpy as np
import pandas as pd
import yfinance as yf

In [47]:
TICKERS_FILE = './data/adr_tickers.csv'
FEATURES_FILE = './data/adr_features.csv'
LABELS_FILE = './data/adr_labels.csv'

#TICKERS_FILE = './data/us_tickers.txt'
#FEATURES_FILE = './data/us_features.csv'
#LABELS_FILE = './data/us_labels.csv'

## Tickers

Source: https://github.com/rreichel3/US-Stock-Symbols

In [48]:
df = pd.read_csv(TICKERS_FILE, header=None, dtype=str, keep_default_na=False)
#for index, row in df.iterrows():
#    if type(row[0]) is not str:
#        print(index, row[0])
t = df[0].to_list()
t.sort()
len(t)

463

## Features

Collect feature data from Yahoo Finance, using:

- https://github.com/ranaroussi/yfinance

In [6]:
ticker = yf.Ticker('MSFT')
#ticker.actions
#ticker.dividends
#ticher.history(period='5y')
ticker.financials

Unnamed: 0,2023-06-30,2022-06-30,2021-06-30,2020-06-30
Tax Effect Of Unusual Items,0.0,0.0,0.0,0.0
Tax Rate For Calcs,0.189786,0.131134,0.138266,0.16
Normalized EBITDA,102384000000.0,97843000000.0,81602000000.0,65755000000.0
Total Unusual Items,,334000000.0,1303000000.0,28000000.0
Total Unusual Items Excluding Goodwill,,334000000.0,1303000000.0,28000000.0
Net Income From Continuing Operation Net Minority Interest,72361000000.0,72738000000.0,61271000000.0,44281000000.0
Reconciled Depreciation,13861000000.0,14460000000.0,11686000000.0,12796000000.0
Reconciled Cost Of Revenue,65863000000.0,62650000000.0,52232000000.0,46078000000.0
EBIT,88523000000.0,83383000000.0,69916000000.0,52959000000.0
Net Interest Income,,31000000.0,-215000000.0,89000000.0


Most relevant features (columns) are listed at:

- https://yourfinancebook.com/stock-market-financial-metrics/

In [50]:
columns = [
    'shortName',
    'sector',
    'industry',
    # Stock metrics
    'recommendationMean',
    'numberOfAnalystOpinions',
    'overallRisk',
    'beta',
    # Earnings per Share (EPS)
    'previousClose',
    'trailingEps',
    'forwardEps',
    # Price to earnings ratio (P/E)
    'trailingPE',
    'forwardPE',
    #'trailingPegRatio',
    'pegRatio',
    # Price to book value ratio (P/B)
    'bookValue',
    'priceToBook',
    # Enterprise value
    'enterpriseToRevenue',
    'enterpriseToEbitda',
    # Profit margin
    'ebitdaMargins',
    'profitMargins',
    'earningsGrowth',
    'revenueGrowth',
    # Dividend payout ratio and Dividend yield
    # 'dividendRate',
    'dividendYield',
    'payoutRatio',
    'fiveYearAvgDividendYield',
    # Price to free cash flow ratio (lower is better - below 10)
    'enterpriseValue',
    'freeCashflow',
    # Debt to equity ratio (DE)
    'debtToEquity',
    # Return on assets (ROA)
    'returnOnAssets',
    # Return on equity (ROE)
    'returnOnEquity',
    # Quick and current ratios
    'quickRatio',
    'currentRatio',
]

features = pd.DataFrame(index = t, columns=columns)

for idx in features.index:
    ticker = yf.Ticker(idx)
    try:
        info = ticker.info
        for c in columns: 
            try:
                features.loc[idx, [c]] = info[c]
            except Exception as e:
                print (idx, e) # This yields a NaN
    except Exception as e:
        print (idx, e)
features

AACG 'overallRisk'
AACG 'trailingPE'
AACG 'earningsGrowth'
AACG 'dividendYield'
AACG 'fiveYearAvgDividendYield'
AACG 'freeCashflow'
ABB 404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v6/finance/quoteSummary/ABB?modules=financialData&modules=quoteType&modules=defaultKeyStatistics&modules=assetProfile&modules=summaryDetail&ssl=true
ABCM 'overallRisk'
ABCM 'forwardEps'
ABCM 'trailingPE'
ABCM 'forwardPE'
ABCM 'pegRatio'
ABCM 'earningsGrowth'
ABCM 'dividendYield'
ABCM 'fiveYearAvgDividendYield'
ACH 404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v6/finance/quoteSummary/ACH?modules=financialData&modules=quoteType&modules=defaultKeyStatistics&modules=assetProfile&modules=summaryDetail&ssl=true
ADAG 'overallRisk'
ADAG 'trailingPE'
ADAG 'pegRatio'
ADAG 'earningsGrowth'
ADAG 'dividendYield'
ADAG 'fiveYearAvgDividendYield'
ADAP 'overallRisk'
ADAP 'trailingPE'
ADAP 'earningsGrowth'
ADAP 'dividendYield'
ADAP 'fiveYearAvgDividendYield'
ADXN 'recommendatio

Unnamed: 0,shortName,sector,industry,recommendationMean,numberOfAnalystOpinions,overallRisk,beta,previousClose,trailingEps,forwardEps,...,dividendYield,payoutRatio,fiveYearAvgDividendYield,enterpriseValue,freeCashflow,debtToEquity,returnOnAssets,returnOnEquity,quickRatio,currentRatio
AACG,ATA Creativity Global,Consumer Defensive,Education & Training Services,2.0,1,,1.279503,1.21,-0.22,-0.99,...,,0.0,,14553503,,30.286,-0.07129,-0.341,0.175,0.205
ABB,,,,,,,,,,,...,,,,,,,,,,
ABCM,Abcam plc,Healthcare,Biotechnology,2.0,5,,0.923542,21.54,-0.05,,...,,0.0,,5068363264,31137500,30.802,0.03268,-0.01229,0.889,1.266
ABEV,Ambev S.A.,Consumer Defensive,Beverages—Brewers,2.2,7,5,0.617467,2.84,0.19,0.19,...,0.0508,0.8492,3.3,37677678592,10596658176,4.743,0.08271,0.16919,0.607,1.006
ACH,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZH,Zhihu Inc.,Communication Services,Internet Content & Information,2.0,10,,0.418161,1.1,-0.27,-0.07,...,,0.0,,-5439845888,,1.704,-0.09152,-0.19223,3.624,3.735
ZLAB,Zai Lab Limited,Healthcare,Biotechnology,1.7,12,,1.087287,24.14,-4.32,-2.98,...,,0.0,,1545148032,-272515264,2.044,-0.19548,-0.36652,7.069,7.63
ZME,,,,,,,,,,,...,,,,,,,,,,
ZNH,,,,,,,,,,,...,,,,,,,,,,


In [51]:
features.to_csv(FEATURES_FILE, index=True, index_label='ticker')

## Labels

Collect label data from:

- https://www.bestquants.com/2019/11/how-to-web-scrapping-zacks-strong-buy.html

Consider using, to retry requests if necessary:

- https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#urllib3.util.Retry

In [52]:
import requests
from requests.exceptions import HTTPError

def zacks(ticker: str):
    json = { ticker:  None }    
    try:
        response = requests.get('https://quote-feed.zacks.com/index?t=' + ticker)
        response.raise_for_status()
        json = response.json()

    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')

    return json

In [53]:
zacks('MSFT')

{'MSFT': {'source': {'sungard': {'bidasksize': '100x100',
    'dividend_freq': '4',
    'prev_close_date': '08/17/2023 11:01:04',
    'zacks_recommendation': '',
    'timestamp': '15:59',
    'exchange': 'NASDAQ',
    'shares': '',
    'volatility': '.9',
    'pos_size': '100',
    'open': '320.54',
    'yrlow': '213.431',
    'type': 'S',
    'yield': '.85',
    'market_cap': '2372026365886',
    'ask': '319.28',
    'dividend': '.68',
    'dividend_date': '09/14/2023 00:00:00',
    'earnings': '9.81',
    'close': '320.4',
    'day_low': '318.7001',
    'last_trade_datetime': '08/17/2023 11:01:04',
    'volume': '20698864',
    'yrhigh': '366.78',
    'day_high': '321.87',
    'bid': '319.25',
    'name': 'Microsoft Corporation',
    'pe_ratio': '29.39',
    'updated': '08/17/2023 11:01:04'},
   'bats': {'ask_size': '2',
    'routed': '19063',
    'last_trade_datetime': '08/17/2023 11:18:01',
    'matched': '212784',
    'bid_size': '50',
    'net_pct_change': 'NULL',
    'updated': 

In [54]:
labels = pd.DataFrame(index = t, columns=['zacks_rank'])
for idx in labels.index:
    z = zacks(idx)
    try:
        labels.loc[idx, ['zacks_rank']] = z[idx]['zacks_rank']
    except Exception as e:
        print (idx, e)
labels

AMOV 'zacks_rank'
AMYT 'zacks_rank'
ANPC 'zacks_rank'
BLCT 'zacks_rank'
CIG.C 'zacks_rank'
CIH 'zacks_rank'
DIDI 'zacks_rank'
FHS 'zacks_rank'
GET 'zacks_rank'
GSK# 'GSK#'
HLN# 'HLN#'
IONR 'zacks_rank'
MITC 'zacks_rank'
MOHO 'zacks_rank'
NWG# 'NWG#'
OG 'zacks_rank'
QK 'zacks_rank'
Symbol 'Symbol'


Unnamed: 0,zacks_rank
AACG,
ABB,3
ABCM,2
ABEV,2
ACH,
...,...
ZH,3
ZLAB,3
ZME,
ZNH,


In [55]:
labels.to_csv(LABELS_FILE, index=True, index_label='ticker')