In [None]:


def _metric_or_zero(found, key):
    """Return the extracted value for ``key``, or 0 when it is absent or still 'N/A'."""
    value = found.get(key, 'N/A')
    return 0 if value == 'N/A' else value


def _fill_derived_metrics(found):
    """Compute Total Debt, Free Cash Flow and Gross Profit when they are still 'N/A'.

    Each metric is derived independently: an error raised by one formula is
    reported and does not stop the remaining metrics from being computed.
    Metrics that are not keys of ``found`` are left alone, so the result keeps
    exactly the keys it started with.

    Args:
        found: Mapping of metric name to extracted value (or 'N/A'); updated in place.
    """
    def total_debt():
        short_debt = _metric_or_zero(found, "Short-Term Debt")
        long_debt = _metric_or_zero(found, "Long-Term Debt")
        # One component is enough; the missing one contributes 0.
        if short_debt != 0 or long_debt != 0:
            return float(short_debt) + float(long_debt)
        return None

    def free_cash_flow():
        ocf = _metric_or_zero(found, "Operating Cash Flow")
        capex = _metric_or_zero(found, "Capital Expenditures (CapEx)")
        # Both inputs are required.
        if ocf != 0 and capex != 0:
            # abs() so CapEx is subtracted whether it was extracted as a
            # negative or a positive number.
            return float(ocf) - abs(float(capex))
        return None

    def gross_profit():
        revenue = _metric_or_zero(found, "Revenue")
        cost_of_revenue = _metric_or_zero(found, "Cost of Revenue")
        # Both inputs are required.
        if revenue != 0 and cost_of_revenue != 0:
            return float(revenue) - float(cost_of_revenue)
        return None

    formulas = (
        ("Total Debt", total_debt),
        ("Free Cash Flow", free_cash_flow),
        ("Gross Profit", gross_profit),
    )
    for name, formula in formulas:
        if found.get(name) != 'N/A':
            continue  # already extracted, or not a tracked metric
        try:
            value = formula()
        except Exception as e:
            # Best effort: report the failure and move on to the next metric.
            print(f"⚠️ Error in calculations: {e}")
            continue
        if value is not None:
            found[name] = value
            print(f"📊 Calculated {name}: {value}")


def parse_html_precise(filepath):
    """Extract the metrics named in TARGET_VARIANTS from the tables of an HTML file.

    The file is parsed with BeautifulSoup, its tables are filtered with
    is_likely_financial_table, and tables whose cleaned text is empty or
    repeats an earlier table are dropped. Each remaining table is searched for
    every metric that is still missing: find_hierarchical_matches is tried
    first and find_vertical_match is used when it returns None. The first
    non-None value found for a metric is kept. Total Debt, Free Cash Flow and
    Gross Profit are then derived from other metrics when they were not
    extracted directly.

    Progress is printed as the tables are processed.

    Args:
        filepath: Path of the HTML file, read as UTF-8.

    Returns:
        dict with one entry per key of TARGET_VARIANTS; the value is the
        extracted (or derived) value, or the string 'N/A' when nothing was found.
    """
    with open(filepath, encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'lxml')

    tables = soup.find_all('table')
    print(f"🔍 Found {len(tables)} tables")

    # Filter to likely financial tables
    financial_tables = [t for t in tables if is_likely_financial_table(t)]
    print(f"📊 {len(financial_tables)} tables appear to contain financial data")

    found = {key: 'N/A' for key in TARGET_VARIANTS.keys()}

    # Remove empty or duplicate tables (compared by their cleaned text)
    unique_tables = []
    seen_texts = set()
    for table in financial_tables:
        text = clean_text(table.get_text(" ", strip=True))
        if not text or text in seen_texts:
            continue
        seen_texts.add(text)
        unique_tables.append(table)

    for table_idx, table in enumerate(unique_tables):
        print(f"📋 Processing table {table_idx + 1}")

        # Check each target that hasn't been found yet
        for target, variants in TARGET_VARIANTS.items():
            if found[target] != 'N/A':
                continue

            # Hierarchical matching first, vertical matching as the fallback
            value = find_hierarchical_matches(table, target, variants)
            if value is None:
                value = find_vertical_match(table, target, variants)

            if value is not None:
                found[target] = value
                print(f"✅ Found {target}: {value}")

    # Compute derived values; each one is attempted independently
    _fill_derived_metrics(found)

    # Print summary of what was found
    found_count = sum(1 for v in found.values() if v != 'N/A')
    print(f"\n📈 Extracted {found_count}/{len(TARGET_VARIANTS)} financial metrics")

    return found


In [None]:
def _print_extraction_summary(all_data):
    """Print per-category, overall and per-company coverage for the extracted records.

    A metric counts as found for a record when its value is not 'N/A'.

    Args:
        all_data: List of record dicts (one per processed ticker), each holding
            a "Company" entry and the metric keys of TARGET_VARIANTS.
    """
    print("\n📈 Comprehensive Extraction Summary:")
    print("=" * 50)

    # Group by category
    categories = {
        "Balance Sheet - Assets": ["Current Assets", "Total Assets", "Goodwill", "Intangible Assets",
                                   "Property, Plant, and Equipment (Net)", "Cash and Cash Equivalents",
                                   "Accounts Receivable", "Inventory"],
        "Balance Sheet - Liabilities": ["Current Liabilities", "Total Liabilities", "Short-Term Debt",
                                        "Long-Term Debt", "Total Debt", "Accounts Payable"],
        "Balance Sheet - Equity": ["Total Equity", "Treasury Stock", "Retained Earnings", "Preferred Stock",
                                   "Common Shares Outstanding", "Book Value of Equity",
                                   "Accumulated Other Comprehensive Income"],
        "Income Statement": ["Revenue", "Net Income", "Operating Income (EBIT)", "Income Before Tax",
                             "Income Tax Expense", "Research and Development Expense", "Cost of Revenue",
                             "Gross Profit", "Comprehensive Income"],
        "Cash Flow Statement": ["Operating Cash Flow", "Capital Expenditures (CapEx)",
                                "Depreciation & Amortization", "Free Cash Flow"]
    }

    for category, fields in categories.items():
        print(f"\n{category}:")
        found_in_category = 0
        for field in fields:
            # Fields that are not extraction targets are not reported individually.
            if field not in TARGET_VARIANTS:
                continue
            companies_with_field = sum(1 for row in all_data if row.get(field) != 'N/A')
            status = "✅" if companies_with_field > 0 else "❌"
            print(f"  {status} {field}: {companies_with_field}/{len(all_data)} companies")
            if companies_with_field > 0:
                found_in_category += 1
        print(f"  📊 Category Success: {found_in_category}/{len(fields)} fields found")

    # Overall statistics
    metric_total = len(TARGET_VARIANTS)
    total_possible = metric_total * len(all_data)
    total_found = sum(1 for row in all_data for key in TARGET_VARIANTS.keys() if row.get(key) != 'N/A')
    # Guard the division: with no targets configured there is nothing to rate.
    success_rate = (total_found / total_possible) * 100 if total_possible else 0.0
    print(f"\n🎯 Overall Success Rate: {success_rate:.1f}% ({total_found}/{total_possible} data points)")

    # Show companies with most/least data
    company_scores = [
        (row['Company'], sum(1 for key in TARGET_VARIANTS.keys() if row.get(key) != 'N/A'))
        for row in all_data
    ]
    company_scores.sort(key=lambda x: x[1], reverse=True)
    print("\n🏆 Data Coverage by Company:")
    for company, count in company_scores:
        percentage = (count / metric_total) * 100 if metric_total else 0.0
        print(f"  {company}: {count}/{metric_total} ({percentage:.1f}%)")

    print("=" * 50)


def process_all_tickers():
    """Download and parse the latest filing for every ticker, export the results, print a summary.

    For each ticker in TICKERS the CIK is looked up, the latest filing is
    downloaded and saved, and the saved file is parsed with parse_html_precise.
    A failure for one ticker is printed and the loop continues with the next.
    When at least one record was collected, all records are written to a
    timestamped ``financials_<YYYYmmdd_HHMMSS>.xlsx`` file in DOWNLOAD_DIR and
    a coverage summary is printed; otherwise a "no data" message is printed.

    NOTE(review): a later cell defines another function named
    process_all_tickers. When the cells run top to bottom, that later
    definition replaces this one — confirm which of the two is intended.
    """
    cik_df = get_cik_mapping()
    all_data = []

    for ticker in TICKERS:
        print(f"\n📊 Processing {ticker}")
        try:
            cik, company = find_cik(ticker, cik_df)
            if not cik:
                print(f"❌ No CIK for {ticker}")
                continue
            content, form, date = download_latest_filing(cik)
            path = save_filing(content, ticker, form, date)
            data = parse_html_precise(path)

            record = {"Company": company, "Ticker": ticker, "Filing Date": date, "Filing Type": form}
            record.update(data)
            all_data.append(record)
            time.sleep(1)  # respectful delay
        except Exception as e:
            # Best effort: one failing ticker must not abort the whole run.
            print(f"⚠️ Error for {ticker}: {e}")

    if not all_data:
        print("❌ No data extracted")
        return

    df = pd.DataFrame(all_data)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = os.path.join(DOWNLOAD_DIR, f"financials_{timestamp}.xlsx")
    df.to_excel(output_path, index=False)
    print(f"\n✅ All data saved to: {output_path}")

    # Print comprehensive summary of what was found
    _print_extraction_summary(all_data)

Append the extracted data to the final Excel master sheet and save the file.


In [None]:
# === Append to Excel Master Sheet ===
def append_to_master_excel(new_data, output_file):
    """Add ``new_data`` rows to the workbook at ``output_file``, creating it if needed.

    Args:
        new_data: Records accepted by ``pd.DataFrame`` (e.g. a list of dicts).
        output_file: Path of the Excel file to extend or create.

    When the file already exists its rows are read and the new rows are
    concatenated after them with a fresh index; otherwise only the new rows
    are written. The file is written without the index column.
    """
    incoming = pd.DataFrame(new_data)
    master = (
        pd.concat([pd.read_excel(output_file), incoming], ignore_index=True)
        if os.path.exists(output_file)
        else incoming
    )
    master.to_excel(output_file, index=False)
    print(f"\n✅ Data appended to: {output_file}")

# === Processing Logic ===
def process_all_tickers():
    """Download and parse the latest filing for every ticker and append the rows to OUTPUT_FILE.

    For each ticker in TICKERS the CIK is looked up, the latest filing is
    downloaded and saved, and the saved file is parsed with parse_html_precise.
    A ticker without a CIK is reported and skipped; any other failure for a
    ticker is printed and the loop continues with the next. When at least one
    record was collected, the records are passed to append_to_master_excel;
    otherwise a "no data" message is printed.

    NOTE(review): an earlier cell also defines process_all_tickers (with a
    timestamped export and a coverage summary). When the cells run top to
    bottom, this definition replaces that one — confirm which is intended.
    """
    cik_df = get_cik_mapping()
    all_data = []

    for ticker in TICKERS:
        print(f"\n📊 Processing {ticker}")
        try:
            cik, company = find_cik(ticker, cik_df)
            if not cik:
                # Report the skip instead of dropping the ticker silently.
                print(f"❌ No CIK for {ticker}")
                continue
            content, form, date = download_latest_filing(cik)
            path = save_filing(content, ticker, form, date)
            data = parse_html_precise(path)

            record = {"Company": company, "Ticker": ticker, "Filing Date": date, "Filing Type": form}
            record.update(data)
            all_data.append(record)
            time.sleep(1)  # pause between tickers
        except Exception as e:
            # Best effort: one failing ticker must not abort the whole run.
            print(f"⚠️ Error for {ticker}: {e}")

    if all_data:
        append_to_master_excel(all_data, OUTPUT_FILE)
    else:
        print("❌ No data extracted")

# === MAIN ===
if __name__ == "__main__":
    # Entry point: announce the run, then call process_all_tickers(), which
    # handles every ticker listed in TICKERS.
    print("🚀 Starting financial data extraction")
    process_all_tickers()