In [5]:
import yfinance as yf
import pandas as pd

# Define ticker
ticker = yf.Ticker("AAPL")  # Change to any ticker you want

# Fetch financial data
income_stmt = ticker.financials.T  # Transpose so years are rows
cash_flow = ticker.cashflow.T
balance_sheet = ticker.balance_sheet.T
info = ticker.info  # Dictionary of metadata

# Filter last 4 years
income_stmt = income_stmt.head(4)
cash_flow = cash_flow.head(4)
balance_sheet = balance_sheet.head(4)

# Extract relevant fields (safely)
income_fields = [
    'Total Revenue',
    'Operating Income',
    'Income Before Tax',
    'Income Tax Expense'
]
income_available = [col for col in income_fields if col in income_stmt.columns]
income_df = income_stmt[income_available].copy()

cashflow_fields = [
    'Operating Cash Flow',
    'Capital Expenditure'
]
cashflow_available = [col for col in cashflow_fields if col in cash_flow.columns]
cashflow_df = cash_flow[cashflow_available].copy()

balance_fields = [
    'Total Current Assets',
    'Total Current Liabilities',
    'Cash And Cash Equivalents'
]
balance_available = [col for col in balance_fields if col in balance_sheet.columns]
balance_df = balance_sheet[balance_available].copy()

# Stock info (Beta, Market Cap, etc.)
info_keys = ['beta', 'marketCap', 'sharesOutstanding', 'sector']
info_clean = {key: info[key] for key in info_keys if key in info}
info_df = pd.DataFrame.from_dict(info_clean, orient='index', columns=['Value'])

# Optional: round values for readability
income_df = income_df.round(0)
cashflow_df = cashflow_df.round(0)
balance_df = balance_df.round(0)

# Display
print("📄 Income Statement (Last 4 Years):\n", income_df, "\n")
print("💵 Cash Flow Statement (Last 4 Years):\n", cashflow_df, "\n")
print("📊 Balance Sheet (Last 4 Years):\n", balance_df, "\n")
print("📈 Stock Info:\n", info_df)

📄 Income Statement (Last 4 Years):
             Total Revenue  Operating Income
2024-09-30   3.910350e+11      1.232160e+11
2023-09-30   3.832850e+11      1.143010e+11
2022-09-30   3.943280e+11      1.194370e+11
2021-09-30   3.658170e+11      1.089490e+11 

💵 Cash Flow Statement (Last 4 Years):
             Operating Cash Flow  Capital Expenditure
2024-09-30         1.182540e+11        -9.447000e+09
2023-09-30         1.105430e+11        -1.095900e+10
2022-09-30         1.221510e+11        -1.070800e+10
2021-09-30         1.040380e+11        -1.108500e+10 

📊 Balance Sheet (Last 4 Years):
             Cash And Cash Equivalents
2024-09-30               2.994300e+10
2023-09-30               2.996500e+10
2022-09-30               2.364600e+10
2021-09-30               3.494000e+10 

📈 Stock Info:
                            Value
beta                       1.165
marketCap          3014019776512
sharesOutstanding    14840399872
sector                Technology


In [6]:
import yfinance as yf
import pandas as pd
from tqdm import tqdm

# --------- Config ---------
TICKERS = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META']  # Replace with your 1000 tickers
YEARS = 4

# --------- Core Extraction Function ---------

def get_financial_data(ticker_symbol):
    try:
        ticker = yf.Ticker(ticker_symbol)
        
        # Download statements
        income_stmt = ticker.financials.T.head(YEARS)
        cash_flow = ticker.cashflow.T.head(YEARS)
        balance_sheet = ticker.balance_sheet.T.head(YEARS)
        info = ticker.info

        # Income Statement
        income_fields = ['Total Revenue', 'Operating Income', 'Income Before Tax', 'Income Tax Expense']
        income_df = income_stmt[[col for col in income_fields if col in income_stmt.columns]].copy()
        income_df['Ticker'] = ticker_symbol

        # Cash Flow Statement
        cashflow_fields = ['Operating Cash Flow', 'Capital Expenditure']
        cashflow_df = cash_flow[[col for col in cashflow_fields if col in cash_flow.columns]].copy()
        cashflow_df['Ticker'] = ticker_symbol

        # Balance Sheet
        balance_fields = ['Total Current Assets', 'Total Current Liabilities', 'Cash And Cash Equivalents']
        balance_df = balance_sheet[[col for col in balance_fields if col in balance_sheet.columns]].copy()
        balance_df['Ticker'] = ticker_symbol

        # Stock Info
        info_fields = ['beta', 'marketCap', 'sharesOutstanding', 'sector']
        info_clean = {key: info.get(key, None) for key in info_fields}
        info_clean['Ticker'] = ticker_symbol
        info_df = pd.DataFrame([info_clean])
        
        return income_df, cashflow_df, balance_df, info_df

    except Exception as e:
        print(f"❌ Error with {ticker_symbol}: {e}")
        return None, None, None, None

# --------- Loop Through Tickers ---------

all_income = []
all_cashflow = []
all_balance = []
all_info = []

for t in tqdm(TICKERS):
    income, cashflow, balance, info = get_financial_data(t)
    
    if income is not None:
        all_income.append(income)
        all_cashflow.append(cashflow)
        all_balance.append(balance)
        all_info.append(info)

# --------- Concatenate Results ---------

df_income = pd.concat(all_income).reset_index().rename(columns={'index': 'Date'})
df_cashflow = pd.concat(all_cashflow).reset_index().rename(columns={'index': 'Date'})
df_balance = pd.concat(all_balance).reset_index().rename(columns={'index': 'Date'})
df_info = pd.concat(all_info).reset_index(drop=True)

# Optional: round values for readability
df_income = df_income.round(0)
df_cashflow = df_cashflow.round(0)
df_balance = df_balance.round(0)

# --------- Done ---------
print("✅ Completed extraction.")

# If needed: Save to files
# df_income.to_csv('income.csv', index=False)
# df_cashflow.to_csv('cashflow.csv', index=False)
# df_balance.to_csv('balance.csv', index=False)
# df_info.to_csv('stock_info.csv', index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.25s/it]

✅ Completed extraction.





In [17]:
import yfinance as yf
import pandas as pd
from tqdm import tqdm

# --------- Config ---------
TICKERS = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META']  # Replace with your 1000 tickers
YEARS = 4

# --------- Corrected Fields ---------
income_fields = ['Total Revenue', 'Operating Income', 'Pretax Income', 'Tax Provision']
cashflow_fields = ['Operating Cash Flow', 'Capital Expenditure', 'Free Cash Flow', 
                   'Depreciation And Amortization', 'Stock Based Compensation']
balance_fields = ['Current Assets', 'Current Liabilities', 'Cash And Cash Equivalents']

info_fields = ['beta', 'marketCap', 'sharesOutstanding', 'sector']

# --------- Logging lists ---------
missing_log = []

# --------- Core Extraction Function ---------

def get_financial_data(ticker_symbol):
    try:
        ticker = yf.Ticker(ticker_symbol)

        # Fetch financials
        income_stmt = ticker.financials.T.head(YEARS)
        cash_flow = ticker.cashflow.T.head(YEARS)
        balance_sheet = ticker.balance_sheet.T.head(YEARS)
        info = ticker.info

        # --- Income Statement ---
        for col in income_fields:
            if col not in income_stmt.columns:
                income_stmt[col] = pd.NA
                missing_log.append((ticker_symbol, 'Income Statement', col))
        income_df = income_stmt[income_fields].copy()
        income_df['Ticker'] = ticker_symbol

        # --- Cash Flow Statement ---
        for col in cashflow_fields:
            if col not in cash_flow.columns:
                cash_flow[col] = pd.NA
                missing_log.append((ticker_symbol, 'Cash Flow', col))
        cashflow_df = cash_flow[cashflow_fields].copy()
        cashflow_df['Ticker'] = ticker_symbol

        # --- Balance Sheet ---
        for col in balance_fields:
            if col not in balance_sheet.columns:
                balance_sheet[col] = pd.NA
                missing_log.append((ticker_symbol, 'Balance Sheet', col))
        balance_df = balance_sheet[balance_fields].copy()
        balance_df['Ticker'] = ticker_symbol

        # --- Info ---
        info_clean = {key: info.get(key, None) for key in info_fields}
        info_clean['Ticker'] = ticker_symbol
        info_df = pd.DataFrame([info_clean])

        return income_df, cashflow_df, balance_df, info_df

    except Exception as e:
        print(f"❌ Error with {ticker_symbol}: {e}")
        return None, None, None, None

# --------- Loop Through Tickers ---------

all_income = []
all_cashflow = []
all_balance = []
all_info = []

for t in tqdm(TICKERS):
    income, cashflow, balance, info = get_financial_data(t)
    if income is not None:
        all_income.append(income)
        all_cashflow.append(cashflow)
        all_balance.append(balance)
        all_info.append(info)

# --------- Concatenate Results ---------

df_income = pd.concat(all_income).reset_index().rename(columns={'index': 'Date'})
df_cashflow = pd.concat(all_cashflow).reset_index().rename(columns={'index': 'Date'})
df_balance = pd.concat(all_balance).reset_index().rename(columns={'index': 'Date'})
df_info = pd.concat(all_info).reset_index(drop=True)

# --------- Cleanup ---------
df_income['Date'] = pd.to_datetime(df_income['Date'])
df_cashflow['Date'] = pd.to_datetime(df_cashflow['Date'])
df_balance['Date'] = pd.to_datetime(df_balance['Date'])

df_income = df_income.round(0)
df_cashflow = df_cashflow.round(0)
df_balance = df_balance.round(0)



pd.DataFrame(missing_log, columns=['Ticker', 'Statement', 'Missing Column']).to_csv('missing_fields_log.csv', index=False)

print("✅ Done. Missing fields logged to 'missing_fields_log.csv'")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.02it/s]

✅ Done. Missing fields logged to 'missing_fields_log.csv'





In [18]:
df_income

Unnamed: 0,Date,Total Revenue,Operating Income,Pretax Income,Tax Provision,Ticker
0,2024-09-30,391035000000.0,123216000000.0,123485000000.0,29749000000.0,AAPL
1,2023-09-30,383285000000.0,114301000000.0,113736000000.0,16741000000.0,AAPL
2,2022-09-30,394328000000.0,119437000000.0,119103000000.0,19300000000.0,AAPL
3,2021-09-30,365817000000.0,108949000000.0,109207000000.0,14527000000.0,AAPL
4,2025-06-30,281724000000.0,128528000000.0,123627000000.0,21795000000.0,MSFT
5,2024-06-30,245122000000.0,109433000000.0,107787000000.0,19651000000.0,MSFT
6,2023-06-30,211915000000.0,88523000000.0,89311000000.0,16950000000.0,MSFT
7,2022-06-30,198270000000.0,83383000000.0,83716000000.0,10978000000.0,MSFT
8,2024-12-31,350018000000.0,112390000000.0,119815000000.0,19697000000.0,GOOGL
9,2023-12-31,307394000000.0,84293000000.0,85717000000.0,11922000000.0,GOOGL


In [19]:
df_cashflow

Unnamed: 0,Date,Operating Cash Flow,Capital Expenditure,Free Cash Flow,Depreciation And Amortization,Stock Based Compensation,Ticker
0,2024-09-30,118254000000.0,-9447000000.0,108807000000.0,11445000000.0,11688000000.0,AAPL
1,2023-09-30,110543000000.0,-10959000000.0,99584000000.0,11519000000.0,10833000000.0,AAPL
2,2022-09-30,122151000000.0,-10708000000.0,111443000000.0,11104000000.0,9038000000.0,AAPL
3,2021-09-30,104038000000.0,-11085000000.0,92953000000.0,11284000000.0,7906000000.0,AAPL
4,2025-06-30,136162000000.0,-64551000000.0,71611000000.0,34153000000.0,11974000000.0,MSFT
5,2024-06-30,118548000000.0,-44477000000.0,74071000000.0,22287000000.0,10734000000.0,MSFT
6,2023-06-30,87582000000.0,-28107000000.0,59475000000.0,13861000000.0,9611000000.0,MSFT
7,2022-06-30,89035000000.0,-23886000000.0,65149000000.0,14460000000.0,7502000000.0,MSFT
8,2024-12-31,125299000000.0,-52535000000.0,72764000000.0,15311000000.0,22785000000.0,GOOGL
9,2023-12-31,101746000000.0,-32251000000.0,69495000000.0,11946000000.0,22460000000.0,GOOGL


In [20]:
df_balance

Unnamed: 0,Date,Current Assets,Current Liabilities,Cash And Cash Equivalents,Ticker
0,2024-09-30,152987000000.0,176392000000.0,29943000000.0,AAPL
1,2023-09-30,143566000000.0,145308000000.0,29965000000.0,AAPL
2,2022-09-30,135405000000.0,153982000000.0,23646000000.0,AAPL
3,2021-09-30,134836000000.0,125481000000.0,34940000000.0,AAPL
4,2025-06-30,191131000000.0,141218000000.0,30242000000.0,MSFT
5,2024-06-30,159734000000.0,125286000000.0,18315000000.0,MSFT
6,2023-06-30,184257000000.0,104149000000.0,34704000000.0,MSFT
7,2022-06-30,169684000000.0,95082000000.0,13931000000.0,MSFT
8,2024-12-31,163711000000.0,89122000000.0,23466000000.0,GOOGL
9,2023-12-31,171530000000.0,81814000000.0,24048000000.0,GOOGL


In [None]:
# caluclate present FCFF, Use 4 year average to find growth rate for next three years
# Terminal value calcualtion after 3 years, I need to find the WACC? Estimate or ML? First estimate, ML as an add on later.