The target variants are important: each variant must match the corresponding label text in the HTML file in order to extract the correct data.

In [None]:
# Ensure the download directory exists, creating any missing parent directories.
# exist_ok=True keeps this from raising when the directory is already present,
# so the cell can be re-run safely. DOWNLOAD_DIR is not defined in this cell.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
# Canonical financial metric name -> label spellings to look for in a filing.
#
# Each key is the normalized metric name; each value lists the alternative
# label texts treated as that metric. All variants are written in lowercase.
# The code that consumes this table is not in this cell — presumably labels
# are lowercased before comparison and list order may act as a priority;
# confirm against the matcher before reordering or editing entries.
#
# NOTE(review): "stockholders equity" is listed under both "Total Equity" and
# "Book Value of Equity"; check how the matcher resolves that overlap.
TARGET_VARIANTS = {
    # ---- Balance sheet: assets ----
    "Current Assets": [
        "current assets",
        "total current assets",
        "current assets total",
    ],
    "Non-Current Assets": [
        "non-current assets",
        "non current assets",
        "long-term assets",
        "long term assets",
        "noncurrent assets",
        "total non-current assets",
        "total long-term assets",
        "total long term assets",
        "total noncurrent assets",
    ],
    "Total Assets": [
        "total assets",
        "assets total",
        "total consolidated assets",
    ],
    "Cash and Cash Equivalents": [
        "cash and cash equivalents",
        "cash and equivalents",
        "cash & cash equivalents",
        "cash, cash equivalents, and short-term investments",
        "cash, cash equivalents and short-term investments",
        "cash and short-term investments",
        "cash & short-term investments",
    ],
    "Property, Plant, and Equipment (Net)": [
        "property and equipment, net",
        "property, plant and equipment, net",
        "property, plant & equipment, net",
        "property plant and equipment net",
        "pp&e",
        "ppe",
        "property and equipment net",
        "plant and equipment net",
        "property plant equipment net",
        "fixed assets net",
    ],
    "Goodwill": [
        "goodwill",
        "goodwill net",
        "goodwill and intangible assets",
    ],
    "Intangible Assets": [
        "intangible assets, net",
        "intangible assets net",
        "other intangible assets",
        "acquired intangible assets",
        "intangible assets",
        "intangibles net",
    ],
    "Accounts Receivable": [
        "accounts receivable",
        "accounts receivable net",
        "trade receivables",
        "receivables",
        "trade and other receivables",
        "accounts receivable, net",
    ],
    "Inventory": [
        "inventory",
        "inventories",
        "finished goods",
        "raw materials",
        "inventory net",
        "total inventory",
    ],

    # ---- Balance sheet: liabilities ----
    "Current Liabilities": [
        "current liabilities",
        "total current liabilities",
        "current liabilities total",
    ],
    "Non-Current Liabilities": [
        "non-current liabilities",
        "non current liabilities",
        "long term liabilities",
        "noncurrent liabilities",
        "total non-current liabilities",
        "total long-term liabilities",
        "other long-term liabilities",
        "long-term liabilities",
        "total noncurrent liabilities",
    ],
    "Total Liabilities": [
        "total liabilities",
        "liabilities total",
        "total consolidated liabilities",
    ],
    "Short-Term Debt": [
        "short-term debt",
        "short term debt",
        "current portion of long-term debt",
        "current debt",
        "current borrowings",
        "short-term borrowings",
        "current portion of debt",
        "debt due within one year",
    ],
    "Long-Term Debt": [
        "long-term debt",
        "long term debt",
        "long-term borrowings",
        "long-term obligations",
        "noncurrent debt",
        "debt securities",
        "long-term debt securities",
        "term debt",
    ],
    "Accounts Payable": [
        "accounts payable",
        "trade payables",
        "accounts payable and accrued liabilities",
        "trade and other payables",
    ],
    "Total Debt": [
        "total debt",
        "total borrowings",
        "total debt securities",
    ],

    # ---- Balance sheet: equity ----
    "Total Equity": [
        "total stockholders' equity",
        "total shareholders' equity",
        "total equity",
        "stockholders' equity",
        "shareholders' equity",
        "total shareholders equity",
        "total stockholders equity",
        "stockholders equity",
        "shareholders equity",
    ],
    "Treasury Stock": [
        "treasury stock",
        "treasury shares",
        "shares held in treasury",
    ],
    "Retained Earnings": [
        "retained earnings",
        "accumulated deficit",
        "accumulated earnings",
        "retained earnings (accumulated deficit)",
    ],
    "Preferred Stock": [
        "preferred stock",
        "preference shares",
        "preferred shares",
    ],
    "Common Shares Outstanding": [
        "common shares outstanding",
        "common stock outstanding",
        "ordinary shares outstanding",
        "common stock and paid-in capital",
        "common stock",
        "ordinary shares",
    ],
    "Book Value of Equity": [
        "book value of equity",
        "stockholders equity",
        "net worth",
    ],
    "Accumulated Other Comprehensive Income": [
        "accumulated other comprehensive income",
        "accumulated other comprehensive loss",
        "accumulated other comprehensive income (loss)",
        "aoci",
    ],

    # ---- Income statement ----
    "Revenue": [
        "total revenue",
        "revenue",
        "net revenue",
        "net sales",
        "total net sales",
        "sales",
        "total sales",
        "operating revenue",
        "service revenue",
    ],
    "Cost of Revenue": [
        "cost of revenue",
        "cost of sales",
        "cost of goods sold",
        "cogs",
        "cost of services",
        "cost of products sold",
    ],
    "Gross Profit": [
        "gross margin",
        "gross profit",
        "gross income",
    ],
    "Operating Income (EBIT)": [
        "operating income",
        "income from operations",
        "operating profit",
        "earnings before interest and taxes",
        "ebit",
        "operating earnings",
    ],
    "Net Income": [
        "net income",
        "net earnings",
        "net income (loss)",
        "net profit",
        "profit for the year",
        "profit attributable to shareholders",
    ],
    "Research and Development Expense": [
        "research and development",
        "r&d expense",
        "research and development expense",
        "research and development costs",
    ],
    "Income Before Tax": [
        "income before income taxes",
        "income before tax",
        "earnings before tax",
        "profit before tax",
        "income before provision for income taxes",
    ],
    "Income Tax Expense": [
        "provision for income taxes",
        "income tax expense",
        "income taxes",
        "tax expense",
        "income tax provision",
    ],
    "Comprehensive Income": [
        "comprehensive income",
        "total comprehensive income",
        "comprehensive earnings",
    ],

    # ---- Cash flow statement ----
    "Operating Cash Flow": [
        "net cash from operations",
        "net cash provided by operating activities",
        "operating cash flow",
        "cash flows from operating activities",
        "net cash from operating activities",
    ],
    "Capital Expenditures (CapEx)": [
        "additions to property and equipment",
        "purchases of property and equipment",
        "capital expenditures",
        "capex",
        "capital investments",
        "capital additions",
    ],
    "Depreciation & Amortization": [
        "depreciation, amortization, and other",
        "depreciation and amortization",
        "amortization",
        "depreciation",
        "depreciation expense",
    ],
    "Free Cash Flow": [
        "free cash flow",
        "cash flow from operations less capex",
    ],
    "NOPAT": [
        "nopat",
        "net operating profit after tax",
        "net operating profit",
    ],
}

# Canonical metric name -> phrases that disqualify a candidate label.
#
# Keys reuse the metric names from TARGET_VARIANTS; only metrics that need
# disambiguation have an entry. The consuming code is not in this cell —
# presumably a label containing any listed phrase is rejected for that metric;
# confirm against the matcher.
#
# NOTE(review): "loss" under "Net Income" is a substring of the
# "net income (loss)" variant in TARGET_VARIANTS; if exclusions are applied as
# substring checks, that variant could never match — verify.
EXCLUSION_PATTERNS = {
    "Total Assets": [
        "deferred tax assets",
        "other assets",
        "current assets",
        "non-current assets",
    ],
    "Total Liabilities": [
        "deferred tax liabilities",
        "other liabilities",
        "current liabilities",
        "non-current liabilities",
    ],
    "Total Equity": [
        "stockholders' deficit",
        "shareholders' deficit",
        "equity method",
    ],
    "Revenue": [
        "deferred revenue",
        "unearned revenue",
        "other revenue",
    ],
    "Net Income": [
        "other comprehensive income",
        "loss",
        "deficit",
    ],
    "Operating Income (EBIT)": [
        "non-operating income",
        "other income",
    ],
    "Cash and Cash Equivalents": [
        "restricted cash",
        "cash flows",
    ],
    "Current Assets": [
        "non-current assets",
        "total assets",
    ],
    "Current Liabilities": [
        "non-current liabilities",
        "total liabilities",
    ],
    "Short-Term Debt": [
        "long-term debt",
        "total debt",
    ],
    "Long-Term Debt": [
        "short-term debt",
        "current portion",
    ],
    "Comprehensive Income": [
        "other comprehensive income",
        "accumulated other comprehensive",
    ],
}



