#### Install dependencies

In [366]:
# Install yfinance package.
!pip install yfinance html5lib --q

#### Define list of stocks
TODO: create a screener to feed this process.

In [420]:
# Ticker symbols to analyze; each one is passed to yfinance further down.
stock_list = [
    "AAPL", "BRK-B", "NVDA", "MSFT", "GOOGL", "AMZN", "V",
    "PLTR", "OKLO", "BABA", "BIDU", "QCOM", "JD",
]

#### Define function to import metrics from a given stock list.

In [421]:
import yfinance as yf
import pandas as pd

def _first_available(info, *keys):
    """Return the first value among ``keys`` in ``info`` that is not None, else None."""
    for key in keys:
        value = info.get(key)
        if value is not None:
            return value
    return None


def get_stock_financial_metrics(ticker_symbol):
    """
    Retrieves key financial metrics for a given stock ticker using yfinance.

    Args:
        ticker_symbol (str): The stock ticker symbol (e.g., "AAPL", "MSFT").

    Returns:
        dict | str:
            * On success, a dict with the keys "ticker", "price", "pe_ratio", "eps",
              "roe", "roa", "profit_margin", "book_value_per_share",
              "shares_outstanding", "price_to_book" and "shortName". Metrics that
              yfinance does not provide are None.
            * A string starting with "Could not retrieve valid data for ticker:" when
              the returned info has no usable 'symbol' or matches the invalid-ticker
              pattern checked below.
            * If any exception is raised while fetching, a dict with the same keys
              (all metrics None), "shortName" set to "Error: <message>" and an extra
              "error_message" key holding the exception text.
    """
    # Shared skeleton so the error result always has the same keys as a success result.
    empty_metrics = {
        "ticker": ticker_symbol,
        "price": None,
        "pe_ratio": None,
        "eps": None,
        "roe": None,
        "roa": None,
        "profit_margin": None,
        "book_value_per_share": None,
        "shares_outstanding": None,
        "price_to_book": None,
        "shortName": None,
    }

    try:
        stock = yf.Ticker(ticker_symbol)
        info = stock.info
        # Treat a missing / non-dict info payload like an empty one instead of letting
        # the validation below fail with an AttributeError.
        if not isinstance(info, dict):
            info = {}

        symbol = info.get('symbol')
        symbol_matches = isinstance(symbol, str) and symbol.lower() == ticker_symbol.lower()
        if not symbol_matches:
            # No market price together with an empty logo URL is treated as the
            # signature of an invalid or delisted ticker.
            if info.get('regularMarketPrice') is None and info.get('logo_url') == '':
                return f"Could not retrieve valid data for ticker: {ticker_symbol}. It might be an invalid or delisted ticker."
            # Without any symbol the payload cannot be trusted at all.
            if symbol is None:
                return f"Could not retrieve valid data for ticker: {ticker_symbol}. Essential 'symbol' info missing."
            # A present-but-different symbol is unusual; the metrics are still returned.

        # dict.get(key, default) only uses the default when the key is absent, so the
        # fallbacks are resolved explicitly to also cover keys that are present but None.
        metrics = dict(empty_metrics)
        metrics.update({
            "price": _first_available(info, 'currentPrice', 'regularMarketPrice', 'previousClose'),
            "pe_ratio": _first_available(info, 'trailingPE', 'forwardPE'),
            "eps": _first_available(info, 'trailingEps', 'forwardEps'),
            "roe": info.get('returnOnEquity'),
            "roa": info.get('returnOnAssets'),
            "profit_margin": info.get('profitMargins'),
            "book_value_per_share": info.get('bookValue'),
            "shares_outstanding": info.get('sharesOutstanding'),
            "price_to_book": info.get('priceToBook'),
            "shortName": info.get('shortName'),
        })
        return metrics

    except Exception as e:
        # Any failure while fetching (or a non-string ticker) is reported in-band so a
        # batch run over many tickers can continue.
        failed = dict(empty_metrics)
        failed["shortName"] = f"Error: {str(e)}"
        failed["error_message"] = str(e)
        return failed


def get_financials_for_stock_list(ticker_list):
    """
    Build one DataFrame row per ticker from ``get_stock_financial_metrics``.

    Args:
        ticker_list (list): Stock ticker symbols (e.g., ["AAPL", "MSFT", "GOOGL"]).

    Returns:
        pandas.DataFrame: One row per ticker with the columns "ticker", "shortName",
        the metric columns and "error_message" (in that order, limited to the columns
        actually present). "error_message" is None for rows that came back as a dict
        without that key. An empty ``ticker_list`` yields an empty DataFrame.
    """
    metric_fields = ("price", "pe_ratio", "eps", "roe", "roa", "profit_margin",
                     "book_value_per_share", "shares_outstanding", "price_to_book")

    def blank_row(symbol, message):
        # Row with every metric empty; only the ticker and the error text are set.
        row = {"ticker": symbol}
        row.update(dict.fromkeys(metric_fields))
        row["shortName"] = None
        row["error_message"] = message
        return row

    rows = []
    for symbol in ticker_list:
        fetched = get_stock_financial_metrics(symbol)

        if isinstance(fetched, dict):
            # Normal case: make sure the error column exists on the returned dict.
            fetched.setdefault("error_message", None)
            rows.append(fetched)
        elif isinstance(fetched, str) and "Could not retrieve" in fetched:
            # String result: the message itself becomes the row's error.
            rows.append(blank_row(symbol, fetched))
        else:
            # Anything else is not a shape the fetch function is expected to return.
            rows.append(blank_row(symbol, "Unknown error structure from get_stock_financial_metrics"))

    frame = pd.DataFrame(rows)
    if frame.empty:
        return frame

    # Identification columns first, error column last; unknown columns are dropped.
    preferred_order = ["ticker", "shortName", *metric_fields, "error_message"]
    return frame[[name for name in preferred_order if name in frame.columns]]

#### Run the function on the current stock list and store the error-free rows in `successful_data_df`

In [422]:
# Fetch the metrics for every ticker in `stock_list`, then keep the rows that loaded without an error.
if __name__ == "__main__":
    print("Starting financial data retrieval...")
    financials_df = get_financials_for_stock_list(stock_list)

    if not financials_df.empty:
        # Rows whose error_message is null are the successfully fetched tickers.
        # .copy() yields an independent frame, so the column assignments below
        # do not trigger SettingWithCopyWarning.
        successful_data_df = financials_df.loc[financials_df['error_message'].isna()].copy()

        # Force the metric columns to a numeric dtype; values that cannot be parsed become NaN.
        numeric_cols = ["price", "pe_ratio", "eps", "roe", "roa", "profit_margin",
                        "book_value_per_share", "shares_outstanding", "price_to_book"]
        for col in numeric_cols:
            if col not in successful_data_df.columns:
                continue
            successful_data_df[col] = pd.to_numeric(successful_data_df[col], errors='coerce')

Starting financial data retrieval...


---

#### Add alternative ticker for ValueInvesting.io website.

In [423]:
# Tickers that are looked up on ValueInvesting.io under a different symbol.
stock_replacement_map = {'BRK-B': 'BRK.A', 'JD': '9618.HK'}

# Record the alternative symbol next to each ticker; tickers without a mapping get NaN.
successful_data_df['alt_ticker'] = successful_data_df['ticker'].map(stock_replacement_map)

# Ticker list for the ValueInvesting.io scraper: the mapped symbol where one exists,
# otherwise the original ticker.
stock_list_vi = [
    stock_replacement_map[symbol] if symbol in stock_replacement_map else symbol
    for symbol in stock_list
]

In [424]:
import requests
from bs4 import BeautifulSoup
import pandas as pd # Import pandas for DataFrame
import numpy as np # For np.nan, though pd.NA is often preferred
from tabulate import tabulate # Import tabulate for pretty printing DataFrames

def extract_div_value_by_class(url, div_class_name, instance_number=1):
    """
    Fetch ``url`` and return the stripped text of the Nth ``<div>`` with the given class.

    Args:
        url (str): Page to download.
        div_class_name (str): Value passed as ``class_`` when searching for divs.
        instance_number (int): 1-based position of the wanted div among the matches.

    Returns:
        str or None: The div's text, or None when ``instance_number`` is not a
        positive integer, fewer matches exist than requested, or any error occurs
        while downloading or parsing.
    """
    position_is_valid = isinstance(instance_number, int) and instance_number >= 1
    if not position_is_valid:
        return None

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
        matches = BeautifulSoup(page.content, 'html.parser').find_all('div', class_=div_class_name)
        if instance_number > len(matches):
            return None
        return matches[instance_number - 1].get_text(strip=True)
    except Exception:
        # Request failures and unexpected parsing errors are both reported as "no value".
        return None

def extract_value_from_nested_div(url, parent_tag_name, parent_class_name, 
                                  child_tag_name, child_class_name, 
                                  parent_instance_number=1, child_instance_number=1):
    """
    Fetch ``url`` and return the text of a child element located inside a parent element.

    The parent is the Nth element matching ``parent_tag_name``/``parent_class_name`` on
    the page; the child is the Nth element matching ``child_tag_name``/``child_class_name``
    inside that parent. Both positions are 1-based.

    Returns:
        str or None: The child's stripped text, or None when a position is not a
        positive integer, the parent or child is not found, or any error occurs
        while downloading or parsing.
    """
    for position in (parent_instance_number, child_instance_number):
        if not isinstance(position, int) or position < 1:
            return None

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
        document = BeautifulSoup(page.content, 'html.parser')

        parents = document.find_all(parent_tag_name, class_=parent_class_name)
        if parent_instance_number > len(parents):
            return None

        children = parents[parent_instance_number - 1].find_all(child_tag_name, class_=child_class_name)
        if child_instance_number > len(children):
            return None

        return children[child_instance_number - 1].get_text(strip=True)
    except Exception:
        # Request failures and unexpected parsing errors are both reported as "no value".
        return None

def extract_value_from_deeply_nested_div(url, 
                                         gp_tag, gp_class, gp_instance,
                                         p_tag, p_class, p_instance,
                                         c_tag, c_class, c_instance):
    """
    Fetch ``url`` and return the text of an element nested three levels deep.

    The search narrows level by level: the Nth grandparent (gp) on the page, then the
    Nth parent (p) inside it, then the Nth child (c) inside that parent. Each level is
    described by a tag name, a class value and a 1-based instance number.

    Returns:
        str or None: The child's stripped text, or None when an instance number is
        not a positive integer, any level has fewer matches than requested, or any
        error occurs while downloading or parsing.
    """
    levels = (
        (gp_tag, gp_class, gp_instance),
        (p_tag, p_class, p_instance),
        (c_tag, c_class, c_instance),
    )
    if any(not isinstance(position, int) or position < 1 for _, _, position in levels):
        return None

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()

        # Start from the whole document and descend one level per iteration.
        scope = BeautifulSoup(page.content, 'html.parser')
        for tag, css_class, position in levels:
            candidates = scope.find_all(tag, class_=css_class)
            if len(candidates) < position:
                return None
            scope = candidates[position - 1]

        return scope.get_text(strip=True)
    except Exception:
        # Request failures and unexpected parsing errors are both reported as "no value".
        return None


def clean_and_convert_value_to_numeric(value_input):
    """
    Convert a scraped value such as "$1.5B" or "2,500" to a float.

    Real numbers (including NumPy scalar types such as ``np.int64``) are returned as
    ``float(value_input)``. Strings are stripped, a trailing magnitude suffix
    (T, B, M or K, case-insensitive) is turned into a multiplier, the characters
    ``$``, ``€``, ``£`` and ``,`` are removed, and the remainder is parsed as a float.

    Args:
        value_input: The value to convert (number, string, or anything else).

    Returns:
        float: The numeric value, scaled by the suffix multiplier if one was present.
        pd.NA: When the input is neither a real number nor a string, is blank,
        consists only of a suffix/symbols, or cannot be parsed as a float.
    """
    import numbers  # local import so this function does not rely on another cell

    # numbers.Real also matches NumPy integer scalars, which a plain
    # isinstance(..., (int, float)) check rejects.
    if isinstance(value_input, numbers.Real):
        return float(value_input)
    if not isinstance(value_input, str):
        return pd.NA

    text = value_input.strip()
    if not text:
        return pd.NA

    suffix_multipliers = {
        'T': 1_000_000_000_000.0,
        'B': 1_000_000_000.0,
        'M': 1_000_000.0,
        'K': 1_000.0,
    }
    multiplier = 1.0
    suffix = text[-1].upper()
    if suffix in suffix_multipliers:
        multiplier = suffix_multipliers[suffix]
        text = text[:-1]

    # Currency symbols and thousands separators carry no numeric information.
    for symbol in "$€£,":
        text = text.replace(symbol, '')
    text = text.strip()

    # Covers inputs that were only a suffix and/or symbols, e.g. "K" or "$".
    if not text:
        return pd.NA

    try:
        return float(text) * multiplier
    except ValueError:
        return pd.NA


if __name__ == "__main__":
    # --- Dynamic URL generation: one ValueInvesting.io DCF page per ticker ---
    base_url_template = "https://valueinvesting.io/{TICKER}/valuation/dcf-growth-exit-5y"
    urls_to_scrape = [base_url_template.format(TICKER=value) for value in stock_list_vi]

    # --- Configuration for DEEPLY nested extraction ---
    # Grandparent: first instance of div class 'fs col-lg-2'
    gp_tag_config = 'div'
    gp_class_config = 'fs col-lg-2'  # multi-class value passed as one string
    gp_instance_config = 1

    # Parent: second instance of div class 'price_square' (within grandparent)
    p_tag_config = 'div'
    p_class_config = 'price_square'
    p_instance_config = 2

    # Child: first instance of div class 'norm' (within parent)
    c_tag_config = 'div'
    c_class_config = 'norm'
    c_instance_config = 1

    results_data = []

    # Each URL is paired with the ticker it was built from, so the ticker never has
    # to be parsed back out of the URL.
    for ticker, current_url in zip(stock_list_vi, urls_to_scrape):
        raw_extracted_value = extract_value_from_deeply_nested_div(
            current_url,
            gp_tag_config, gp_class_config, gp_instance_config,
            p_tag_config, p_class_config, p_instance_config,
            c_tag_config, c_class_config, c_instance_config
        )

        # No extra processing yet: the raw text is split into value/currency below.
        processed_value_for_df = raw_extracted_value

        results_data.append({
            "url": current_url,
            "ticker": ticker,
            "target_grandparent": f"Inst#{gp_instance_config} {gp_tag_config}.{gp_class_config}",
            "target_parent": f"Inst#{p_instance_config} {p_tag_config}.{p_class_config}",
            "target_child": f"Inst#{c_instance_config} {c_tag_config}.{c_class_config}",
            "extracted_raw_value": raw_extracted_value,
            "processed_value": processed_value_for_df
        })

    # Naming the columns explicitly keeps them present even when nothing was scraped.
    results_columns = ["url", "ticker", "target_grandparent", "target_parent",
                       "target_child", "extracted_raw_value", "processed_value"]
    results_df = pd.DataFrame(results_data, columns=results_columns)

    # Split the scraped text into a number and a three-letter currency code.
    # The optional leading '-' keeps the sign of negative fair values.
    results_df[['value', 'currency']] = results_df['processed_value'].str.extract(r'(-?[\d,\.]+)\s*([A-Z]{3})')

    # Drop thousands separators and convert; captures that are not valid numbers
    # become NaN instead of raising.
    results_df['value'] = pd.to_numeric(
        results_df['value'].str.replace(',', '', regex=False), errors='coerce'
    )

    def get_fx_rate(currency_code):
        """
        Return the conversion rate from ``currency_code`` to USD.

        USD itself maps to 1.0. Other codes are looked up via yfinance as the
        ``<code>USD=X`` ticker, using the last 'Close' of a one-day history.
        Returns NaN for a missing code, an empty history, or a lookup error.
        """
        if pd.isna(currency_code):
            return np.nan
        if currency_code == "USD":
            return 1.0
        ticker = f"{currency_code}USD=X"
        try:
            data = yf.Ticker(ticker).history(period="1d")
            return data['Close'].iloc[-1] if not data.empty else np.nan
        except Exception as e:
            print(f"Error fetching rate for {ticker}: {e}")
            return np.nan

    # Look each distinct currency up once instead of once per row.
    fx_rates = {code: get_fx_rate(code) for code in results_df['currency'].dropna().unique()}
    results_df['fx_rate'] = results_df['currency'].map(fx_rates).astype(float)

    # Convert amounts to USD
    results_df['amount_usd'] = results_df['value'] * results_df['fx_rate']

#### Combine DCF values into analysis table and calculate opportunity.
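Opportunity = % difference between the current share price and the 5-year growth DCF fair value, i.e. (price − fair value) / fair value × 100. A negative value means the stock trades below its fair value.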

In [438]:
# Step 1: Primary merge on 'ticker'
merged_df = successful_data_df.merge(
    results_df[['ticker', 'processed_value', 'amount_usd']],
    on='ticker',
    how='left'
)

# Step 2: Identify rows that found no DCF value under their own ticker but have an
# alternative ticker. Both conditions are read from merged_df so they share one
# index; the merge result is re-indexed, while successful_data_df keeps the
# (possibly gapped) index left by the error filter.
missing_mask = merged_df['processed_value'].isna() & merged_df['alt_ticker'].notna()

# Step 3: Lookup table from scraped ticker to its DCF columns.
# Series.map needs a unique index, hence the de-duplication.
dcf_lookup = results_df.drop_duplicates(subset='ticker').set_index('ticker')

# Step 4: Fill the missing values from the alternative ticker. Skipped entirely
# when no row needs the fallback.
if missing_mask.any():
    for col in ['processed_value', 'amount_usd']:
        merged_df.loc[missing_mask, col] = merged_df.loc[missing_mask, 'alt_ticker'].map(dcf_lookup[col])

# Step 5: Rename processed_value to dcf_5yg, leave amount_usd as-is
merged_df = merged_df.rename(columns={'processed_value': 'dcf_5yg'})

# Opportunity: percentage by which the share price differs from the DCF fair value,
# (price - fair value) / fair value * 100. Negative means the price is below fair value.
# A fair value of exactly 0 yields NaN instead of a division by zero.
merged_df['opportunity'] = np.where(
    merged_df['amount_usd'] != 0,
    ((merged_df['price'] - merged_df['amount_usd']) / merged_df['amount_usd']) * 100,
    np.nan
)

# Cleanup column names.
merged_df = merged_df.rename(columns={
    'price_to_book': 'pb',
    'amount_usd': 'dcf_5y_usd',
    'pe_ratio': 'pe'
})

#### Analysis: Explore DF, get top opportunities.

In [444]:
# Key metrics per ticker, ordered by "opportunity" (margin of safety to the 5Y fair value):
# the most negative value, i.e. the price furthest below fair value, comes first.
merged_df.loc[:, ['ticker', 'price', 'pe', 'roe', 'profit_margin', 'dcf_5y_usd', 'opportunity']].sort_values('opportunity')

Unnamed: 0,ticker,price,pe,roe,profit_margin,dcf_5y_usd,opportunity
10,BIDU,84.39,8.372024,0.09374,0.19415,138.47,-39.055391
9,BABA,117.18,15.75,0.11438,0.13059,152.43,-23.125369
11,QCOM,148.34,15.121304,0.42205,0.2611,180.09,-17.630074
5,AMZN,205.7,33.55628,0.2524,0.1014,240.85,-14.594146
4,GOOGL,171.86,19.202234,0.34789,0.30857,196.58,-12.575033
1,BRK-B,506.18,13.494535,0.13187,0.21788,554.5,-8.714157
6,V,362.4,36.349045,0.50655,0.52859,324.15,11.800093
0,AAPL,199.95,31.14486,1.38015,0.24301,176.25,13.446809
2,NVDA,139.19,44.9,1.15463,0.51694,109.86,26.697615
12,JD,32.94,8.053789,0.16283,0.0376,24.12266,36.552103
