# Collect Data

This notebook uses a list of tickers to create `./data/features.csv` and `./data/labels.csv` from data found on Yahoo Finance and Zacks.com.

## Installation

In [6]:
#%pip install --upgrade numpy
#%pip install --upgrade pandas
#%pip install --upgrade yfinance

In [2]:
import numpy as np
import pandas as pd
import yfinance as yf

## Tickers

Source: https://github.com/rreichel3/US-Stock-Symbols

In [20]:
df = pd.read_csv('./data/all_tickers.txt', header=None, dtype=str, keep_default_na=False)
#for index, row in df.iterrows():
#    if type(row[0]) is not str:
#        print(index, row[0])
t = df[0].to_list()
t.sort()
t

['A',
 'AA',
 'AAC',
 'AACG',
 'AACI',
 'AACIU',
 'AACIW',
 'AACT',
 'AADI',
 'AAIC',
 'AAIN',
 'AAL',
 'AAM',
 'AAMC',
 'AAME',
 'AAN',
 'AAOI',
 'AAON',
 'AAP',
 'AAPL',
 'AAT',
 'AAU',
 'AB',
 'ABBV',
 'ABC',
 'ABCB',
 'ABCL',
 'ABCM',
 'ABEO',
 'ABEV',
 'ABG',
 'ABIO',
 'ABL',
 'ABLLW',
 'ABM',
 'ABNB',
 'ABOS',
 'ABR',
 'ABSI',
 'ABT',
 'ABUS',
 'ABVC',
 'AC',
 'ACA',
 'ACABW',
 'ACAC',
 'ACAD',
 'ACAH',
 'ACAHU',
 'ACAHW',
 'ACAQ',
 'ACB',
 'ACBA',
 'ACBAU',
 'ACBAW',
 'ACCD',
 'ACCO',
 'ACDC',
 'ACDCW',
 'ACEL',
 'ACER',
 'ACET',
 'ACGL',
 'ACGLN',
 'ACGLO',
 'ACGN',
 'ACHC',
 'ACHL',
 'ACHR',
 'ACHV',
 'ACI',
 'ACIU',
 'ACIW',
 'ACLS',
 'ACLX',
 'ACM',
 'ACMR',
 'ACN',
 'ACNB',
 'ACNT',
 'ACON',
 'ACONW',
 'ACOR',
 'ACP',
 'ACR',
 'ACRE',
 'ACRO',
 'ACRS',
 'ACRV',
 'ACRX',
 'ACST',
 'ACT',
 'ACTG',
 'ACU',
 'ACV',
 'ACVA',
 'ACXP',
 'ADAG',
 'ADAP',
 'ADBE',
 'ADC',
 'ADCT',
 'ADD',
 'ADEA',
 'ADER',
 'ADERW',
 'ADES',
 'ADEX',
 'ADI',
 'ADIL',
 'ADM',
 'ADMA',
 'ADMP',
 'ADN'

## Features

Collect feature data from Yahoo Finance, using:

- https://github.com/ranaroussi/yfinance

In [21]:
ticker = yf.Ticker('MSFT')
#ticker.actions
#ticker.dividends
#ticher.history(period='5y')
ticker.info

{'address1': 'One Microsoft Way',
 'city': 'Redmond',
 'state': 'WA',
 'zip': '98052-6399',
 'country': 'United States',
 'phone': '425 882 8080',
 'website': 'https://www.microsoft.com',
 'industry': 'Software—Infrastructure',
 'industryDisp': 'Software—Infrastructure',
 'sector': 'Technology',
 'sectorDisp': 'Technology',
 'longBusinessSummary': 'Microsoft Corporation develops and supports software, services, devices and solutions worldwide. The Productivity and Business Processes segment offers office, exchange, SharePoint, Microsoft Teams, office 365 Security and Compliance, Microsoft viva, and Microsoft 365 copilot; and office consumer services, such as Microsoft 365 consumer subscriptions, Office licensed on-premises, and other office services. This segment also provides LinkedIn; and dynamics business solutions, including Dynamics 365, a set of intelligent, cloud-based applications across ERP, CRM, power apps, and power automate; and on-premises ERP and CRM applications. The Int

Most relevant features (columns) are listed at:

- https://yourfinancebook.com/stock-market-financial-metrics/

In [29]:
columns = [
    'shortName',
    'sector',
    'industry',
    # Stock metrics
    'recommendationMean',
    'overallRisk',
    'beta',
    # Earnings per Share (EPS)
    'trailingEps',
    'forwardEps',
    # Price to earnings ratio (P/E)
    'trailingPE',
    'forwardPE',
    #'trailingPegRatio',
    'pegRatio',
    # Price to book value ratio (P/B)
    'bookValue',
    'priceToBook',
    # Profit margin
    'ebitdaMargins',
    'profitMargins',
    'earningsGrowth',
    'revenueGrowth',
    # Dividend payout ratio and Dividend yield
    # 'dividendRate',
    'dividendYield',
    'payoutRatio',
    'fiveYearAvgDividendYield',
    # Price to free cash flow ratio
    # ?
    # Debt to equity ratio (DE)
    'debtToEquity',
    # Return on assets (ROA)
    'returnOnAssets',
    # Return on equity (ROE)
    'returnOnEquity',
    # Quick and current ratios
    'quickRatio',
    'currentRatio',
]

features = pd.DataFrame(index = t, columns=columns)

for idx in features.index:
    ticker = yf.Ticker(idx)
    try:
        info = ticker.info
        for c in columns: 
            try:
                features.loc[idx, [c]] = info[c]
            except Exception as e:
                print (idx, e) # This yields a NaN
    except Exception as e:
        print (idx, e)
features

AA 'trailingPE'
AA 'earningsGrowth'
AA 'payoutRatio'
AA 'fiveYearAvgDividendYield'
AAC 'recommendationMean'
AAC 'overallRisk'
AAC 'trailingPE'
AAC 'pegRatio'
AAC 'priceToBook'
AAC 'earningsGrowth'
AAC 'revenueGrowth'
AAC 'dividendYield'
AAC 'fiveYearAvgDividendYield'
AAC 'debtToEquity'
AAC 'returnOnEquity'
AACG 'overallRisk'
AACG 'trailingPE'
AACG 'earningsGrowth'
AACG 'dividendYield'
AACG 'fiveYearAvgDividendYield'
AACI 'recommendationMean'
AACI 'overallRisk'
AACI 'beta'
AACI 'forwardEps'
AACI 'trailingPE'
AACI 'forwardPE'
AACI 'pegRatio'
AACI 'priceToBook'
AACI 'earningsGrowth'
AACI 'revenueGrowth'
AACI 'dividendYield'
AACI 'fiveYearAvgDividendYield'
AACI 'debtToEquity'
AACI 'returnOnEquity'
AACIU 'recommendationMean'
AACIU 'overallRisk'
AACIU 'beta'
AACIU 'forwardEps'
AACIU 'trailingPE'
AACIU 'forwardPE'
AACIU 'pegRatio'
AACIU 'priceToBook'
AACIU 'earningsGrowth'
AACIU 'revenueGrowth'
AACIU 'dividendYield'
AACIU 'payoutRatio'
AACIU 'fiveYearAvgDividendYield'
AACIU 'debtToEquity'
AAC

Unnamed: 0,shortName,sector,industry,recommendationMean,overallRisk,beta,trailingEps,forwardEps,trailingPE,forwardPE,...,earningsGrowth,revenueGrowth,dividendYield,payoutRatio,fiveYearAvgDividendYield,debtToEquity,returnOnAssets,returnOnEquity,quickRatio,currentRatio
A,"Agilent Technologies, Inc.",Healthcare,Diagnostics & Research,2.2,8,1.026324,4.53,5.98,28.333334,21.463211,...,0.121,0.068,0.0071,0.1916,0.7,50.009,0.10401,0.24782,1.546,2.37
AA,Alcoa Corporation,Basic Materials,Aluminum,2.6,1,2.442674,-8.55,3.18,,10.449685,...,,-0.263,0.0118,,,28.399,-0.01106,-0.22429,0.692,1.799
AAC,Ares Acquisition Corporation,Financial Services,Shell Companies,,,0.013452,-0.07,-1.01,,-10.524753,...,,,,0.0,,,-0.00911,,0.006,0.009
AACG,ATA Creativity Global,Consumer Defensive,Education & Training Services,2.0,,1.072739,-0.22,-0.99,,-1.232121,...,,0.004,,0.0,,26.451,-0.07631,-0.33159,0.209,0.231
AACI,Armada Acquisition Corp. I,Financial Services,Shell Companies,,,,-0.59,,,,...,,,,0.0,,,-0.02213,,0.033,0.033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZVSA,"ZyVersa Therapeutics, Inc.",Healthcare,Biotechnology,,,,-2.19,,,,...,,,,0.0,,0.087,-0.13853,-0.33597,0.149,0.33
ZWS,Zurn Elkay Water Solutions Corp,Industrials,Pollution & Treatment Controls,1.9,3,1.11673,0.27,1.12,110.14814,26.553572,...,-0.291,0.419,0.0094,1.6471,,38.134,0.05188,0.05138,1.511,2.976
ZYME,Zymeworks Inc.,Healthcare,Biotechnology,1.9,,0.941461,2.57,-1.33,2.894942,-5.593985,...,,17.569,,0.0,,5.774,0.21186,0.45666,6.056,6.334
ZYNE,"Zynerba Pharmaceuticals, Inc.",Healthcare,Drug Manufacturers—Specialty & Generic,1.7,,1.76169,-0.81,-0.61,,-0.579836,...,,,,0.0,,0.709,-0.37412,-0.70408,4.742,4.951


In [30]:
features.to_csv('./data/features.csv', index=True, index_label='ticker')

## Labels

Collect label data from:

- https://www.bestquants.com/2019/11/how-to-web-scrapping-zacks-strong-buy.html

Consider using, to retry requests if necessary:

- https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#urllib3.util.Retry

In [1]:
import requests
from requests.exceptions import HTTPError

def zacks(ticker: str):
    json = { ticker:  None }    
    try:
        response = requests.get('https://quote-feed.zacks.com/index?t=' + ticker)
        response.raise_for_status()
        json = response.json()

    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')

    return json

In [2]:
zacks('MSFT')

{'MSFT': {'source': {'sungard': {'bidasksize': '100x200',
    'dividend_freq': '4',
    'prev_close_date': '08/09/2023 16:00:00',
    'zacks_recommendation': '',
    'timestamp': '15:59',
    'exchange': 'NASDAQ',
    'shares': '',
    'volatility': '.9',
    'pos_size': '3058486',
    'open': '326.47',
    'yrlow': '213.431',
    'type': 'S',
    'yield': '.83',
    'market_cap': '2394092764140',
    'ask': '322.94',
    'dividend': '.68',
    'dividend_date': '09/14/2023 00:00:00',
    'earnings': '9.81',
    'close': '326.05',
    'day_low': '321.0499',
    'last_trade_datetime': '08/09/2023 16:00:00',
    'volume': '22327574',
    'yrhigh': '366.78',
    'day_high': '327.11',
    'bid': '322.4',
    'name': 'Microsoft Corporation',
    'pe_ratio': '30.06',
    'updated': '08/09/2023 16:00:00'},
   'bats': {'ask_size': '7',
    'routed': '29152',
    'last_trade_datetime': '08/09/2023 15:58:02',
    'matched': '806109',
    'bid_size': '100',
    'net_pct_change': 'NULL',
    'updat

In [27]:
labels = pd.DataFrame(index = t, columns=['zacks_rank'])
for idx in labels.index:
    z = zacks(idx)
    try:
        labels.loc[idx, ['zacks_rank']] = z[idx]['zacks_rank']
    except Exception as e:
        print (idx, e)
labels

AACIU 'zacks_rank'
AACIW 'zacks_rank'
AAIN 'zacks_rank'
AAM 'zacks_rank'
ABLLW 'zacks_rank'
ACABW 'zacks_rank'
ACAHU 'zacks_rank'
ACAHW 'zacks_rank'
ACBAU 'zacks_rank'
ACBAW 'zacks_rank'
ACDCW 'zacks_rank'
ACGLN 'zacks_rank'
ACGLO 'zacks_rank'
ACONW 'zacks_rank'
ADERW 'zacks_rank'
ADNWW 'zacks_rank'
ADOCR 'zacks_rank'
ADOCW 'zacks_rank'
ADSEW 'zacks_rank'
ADTHW 'zacks_rank'
ADVWW 'zacks_rank'
AEAEU 'zacks_rank'
AEAEW 'zacks_rank'
AEFC 'zacks_rank'
AENTW 'zacks_rank'
AEPPZ 'zacks_rank'
AESC 'zacks_rank'
AFARU 'zacks_rank'
AFGB 'zacks_rank'
AFGC 'zacks_rank'
AFGD 'zacks_rank'
AFGE 'zacks_rank'
AFRIW 'zacks_rank'
AGBAW 'zacks_rank'
AGNCL 'zacks_rank'
AGNCM 'zacks_rank'
AGNCN 'zacks_rank'
AGNCO 'zacks_rank'
AGNCP 'zacks_rank'
AGRIW 'zacks_rank'
AIC 'zacks_rank'
AIMBU 'zacks_rank'
AIMDW 'zacks_rank'
AIO 'zacks_rank'
AIRTP 'zacks_rank'
AIZN 'zacks_rank'
AJXA 'zacks_rank'
AKO 'zacks_rank'
ALCYU 'zacks_rank'
ALCYW 'zacks_rank'
ALSAR 'zacks_rank'
ALSAW 'zacks_rank'
ALTUW 'zacks_rank'
ALVOW 'zac

Unnamed: 0,zacks_rank
A,3
AA,3
AAC,
AACG,
AACI,
...,...
ZVSA,
ZWS,1
ZYME,3
ZYNE,3


In [28]:
labels.to_csv('./data/labels.csv', index=True, index_label='ticker')