In [10]:
import os
import re
import pandas as pd
import numpy as np

# Load the KPI value table
values_df = pd.read_csv("../results-pipeline/kpi-value-table.csv", sep=";", encoding="utf-8")

# Load the wskaznik (indicator) dictionary
wskaznik_dict_df = pd.read_csv("../results-pipeline/wskaznik_dictionary.csv", sep=";", encoding="utf-8")

# Display basic info about the loaded data
print(f"Values DataFrame shape: {values_df.shape}")
print(f"Wskaznik Dictionary shape: {wskaznik_dict_df.shape}")
print("\nFirst few rows of values_df:")
print(values_df.head())
print("\nFirst few rows of wskaznik_dict_df:")
print(wskaznik_dict_df.head())

values_df.dtypes

Values DataFrame shape: (835883, 4)
Wskaznik Dictionary shape: (53, 3)

First few rows of values_df:
    rok  wartosc  WSKAZNIK_INDEX  PKD_INDEX
0  2005  1828.70               0        0.0
1  2006  2256.83               0        0.0
2  2007  2027.13               0        0.0
3  2008  1978.48               0        0.0
4  2009  2061.13               0        0.0

First few rows of wskaznik_dict_df:
   WSKAZNIK_INDEX                                           WSKAZNIK MinMax
0               0                    C Środki pieniężne i pap. wart.    Max
1               1                              CF Nadwyżka finansowa    Max
2               2                                   DEPR Amortyzacja    Min
3               3                  EN Liczba jednostek gospodarczych    Max
4               4  GS (I) Przychody netto ze sprzedaży i zrównane...    Max


rok                 int64
wartosc           float64
WSKAZNIK_INDEX      int64
PKD_INDEX         float64
dtype: object

In [11]:
# Create dynamic indicator mapping based on actual wskaznik names
# This searches for indicators by their short code (e.g., 'C', 'NP', etc.)
indicator_mapping = {}

indicator_codes = {
    'C': 'C Środki pieniężne',      # Środki pieniężne i pap. wart.
    'CF': 'CF Nadwyżka finansowa',  # Nadwyżka finansowa
    'DEPR': 'DEPR Amortyzacja',     # Amortyzacja
    'EN': 'EN Liczba jednostek',    # Liczba jednostek gospodarczych
    'GS': 'GS Przychody ogółem',    # Przychody ogółem
    'GS_I': 'GS (I) Przychody netto ze sprzedaży',  # Przychody netto ze sprzedaży
    'INV': 'INV Zapasy',            # Zapasy
    'IO': 'IO Wartość nakładów',    # Wartość nakładów inwestycyjnych
    'IP': 'IP Odsetki',             # Odsetki do zapłacenia
    'LTC': 'LTC Długoterminowe kredyty',  # Długoterminowe kredyty bankowe
    'LTL': 'LTL Zobowiązania długoterminowe',  # Zobowiązania długoterminowe
    'NP': 'NP Wynik finansowy netto',  # Wynik finansowy netto (zysk netto)
    'NWC': 'NWC Kapitał obrotowy',  # Kapitał obrotowy
    'OFE': 'OFE Pozostałe koszty',  # Pozostałe koszty finansowe
    'OP': 'OP Wynik na działalności operacyjnej',  # Wynik na działalności operacyjnej
    'PEN': 'PEN Liczba rentownych', # Liczba rentownych jednostek gospodarczych
    'PNPM': 'PNPM Przychody netto', # Przychody netto
    'POS': 'POS Wynik na sprzedaży',# Wynik na sprzedaży
    'PPO': 'PPO Pozostałe przychody',  # Pozostałe przychody operacyjne
    'REC': 'REC Należności',        # Należności krótkoterminowe
    'STC': 'STC Krótkoterminowe kredyty',  # Krótkoterminowe kredyty bankowe
    'STL': 'STL Zobowiązania krótkoterminowe',  # Zobowiązania krótkoterminowe
    'TC': 'TC Koszty ogółem',       # Koszty ogółem
    'UPADLOSC': 'Upadłość',         # Upadłość
    'ZAMKNIETE': 'Liczba firm zamkniętych',  # Liczba firm zamkniętych
    'ZAWIESZONE': 'Liczba firm z zawieszoną',  # Liczba firm z zawieszoną działalnością
    'ZAREJESTROWANE': 'Liczba firm zarejestrowanych',  # Liczba firm zarejestrowanych
    'NOWE': 'Liczba nowych firm',   # Liczba nowych firm
    'PRZYCHFIN': 'Przych. fin.',    # Przychody finansowe
    'PRACUJACY': 'Przewidywana liczba pracujących Ogółem'  # Liczba pracujących
}

# Build the mapping by finding each indicator in the dictionary
for code, search_pattern in indicator_codes.items():
    matching_rows = wskaznik_dict_df[wskaznik_dict_df['WSKAZNIK'].str.contains(search_pattern, case=False, na=False)]
    if not matching_rows.empty:
        indicator_mapping[code] = matching_rows.iloc[0]['WSKAZNIK_INDEX']
    else:
        print(f"WARNING: Could not find indicator for '{code}' (searched for: '{search_pattern}')")

print("Indicator mapping created:")
for code, idx in sorted(indicator_mapping.items()):
    print(f"  {code}: {idx}")

# Check missing values before imputation
print("\nMissing values before imputation:")
print(f"Total missing values: {values_df['wartosc'].isna().sum()}")
print(f"Percentage: {values_df['wartosc'].isna().sum() / len(values_df) * 100:.2f}%\n")

# Create a copy for imputation
values_imputed = values_df.copy()

# Step 1: Interpolate missing values within same WSKAZNIK_INDEX and PKD_INDEX group
print("Step 1: Interpolating within same indicator and PKD group...")
values_imputed = values_imputed.sort_values(['WSKAZNIK_INDEX', 'PKD_INDEX', 'rok'])

# Apply interpolation for each group
values_imputed['wartosc'] = values_imputed.groupby(['WSKAZNIK_INDEX', 'PKD_INDEX'])['wartosc'].transform(
    lambda group: group.interpolate(method='linear', limit_direction='both')
)

print(f"Missing values after interpolation: {values_imputed['wartosc'].isna().sum()}")

# Step 2: Fill remaining missing values with median for that WSKAZNIK_INDEX in that year
print("Step 2: Filling remaining gaps with median by indicator and year...")

# Calculate median for each WSKAZNIK_INDEX and year
median_by_indicator_year = values_imputed.groupby(['WSKAZNIK_INDEX', 'rok'])['wartosc'].transform('median')

# Fill remaining NaN values with the median
values_imputed['wartosc'] = values_imputed['wartosc'].fillna(median_by_indicator_year)

print(f"Missing values after median imputation: {values_imputed['wartosc'].isna().sum()}")

# Step 3: If still any NaN (edge cases), fill with overall median for that indicator
if values_imputed['wartosc'].isna().sum() > 0:
    print("Step 3: Filling final gaps with overall indicator median...")
    median_by_indicator = values_imputed.groupby('WSKAZNIK_INDEX')['wartosc'].transform('median')
    values_imputed['wartosc'] = values_imputed['wartosc'].fillna(median_by_indicator)
    print(f"Missing values after overall median: {values_imputed['wartosc'].isna().sum()}")

# Step 4: As a last resort, fill any remaining NaN with 0
if values_imputed['wartosc'].isna().sum() > 0:
    print("Step 4: Filling any final remaining gaps with 0...")
    values_imputed['wartosc'] = values_imputed['wartosc'].fillna(0)
    print(f"Final missing values: {values_imputed['wartosc'].isna().sum()}")

print("\n" + "="*80)
print("IMPUTATION COMPLETE")
print("="*80)

# Update values_df with imputed values
values_df = values_imputed.copy()

# Pivot the data to have indicators as columns for easier calculation
pivot_df = values_df.pivot_table(
    index=['rok', 'PKD_INDEX'],
    columns='WSKAZNIK_INDEX',
    values='wartosc',
    aggfunc='mean'  # Use 'mean' to handle any potential duplicates
).reset_index()

# Define the 8 new indicators starting from index 1000
new_indicators = []

# Helper function for safe division
def safe_divide(numerator, denominator, fill_value=0):
    """
    Safely divide, replacing inf/nan with fill_value.
    """
    result = numerator / denominator
    # Replace inf and nan
    result = result.replace([np.inf, -np.inf], fill_value)
    result = result.fillna(fill_value)
    return result

print("\n" + "="*80)
print("CALCULATING NEW INDICATORS WITH SAFE DIVISION")
print("="*80)

# Helper function to get MinMax from wskaznik_dictionary
def get_minmax(indicator_code):
    """Get MinMax value for an indicator from the dictionary"""
    idx = indicator_mapping.get(indicator_code)
    if idx is not None:
        minmax_row = wskaznik_dict_df[wskaznik_dict_df['WSKAZNIK_INDEX'] == idx]
        if not minmax_row.empty:
            return minmax_row.iloc[0].get('MinMax', 'Max')
    return 'Max'  # Default to Max if not found

# =============================================================================
# ZESTAW 1: ZDOLNOŚĆ KREDYTOWA I PŁYNNOŚĆ (indices 1000-1019)
# =============================================================================

# 1000. Net Profit Margin (NP/PNPM)
pivot_df['indicator_1000'] = safe_divide(pivot_df[indicator_mapping['NP']], pivot_df[indicator_mapping['PNPM']])
new_indicators.append({'WSKAZNIK_INDEX': 1000, 'WSKAZNIK': 'Net Profit Margin (NP/PNPM)', 'MinMax': get_minmax('NP')})
print("✓ 1000: Net Profit Margin")

# 1001. Operating Margin (OP/PNPM)
pivot_df['indicator_1001'] = safe_divide(pivot_df[indicator_mapping['OP']], pivot_df[indicator_mapping['PNPM']])
new_indicators.append({'WSKAZNIK_INDEX': 1001, 'WSKAZNIK': 'Operating Margin (OP/PNPM)', 'MinMax': get_minmax('OP')})
print("✓ 1001: Operating Margin")

# 1002. Current Ratio (CR) = (C+REC+INV)/STL
pivot_df['indicator_1002'] = safe_divide(
    pivot_df[indicator_mapping['C']] + pivot_df[indicator_mapping['REC']] + pivot_df[indicator_mapping['INV']],
    pivot_df[indicator_mapping['STL']]
)
new_indicators.append({'WSKAZNIK_INDEX': 1002, 'WSKAZNIK': 'Current Ratio (C+REC+INV)/STL', 'MinMax': 'Max'})
print("✓ 1002: Current Ratio")

# 1003. Quick Ratio (QR) = (C+REC)/STL
pivot_df['indicator_1003'] = safe_divide(
    pivot_df[indicator_mapping['C']] + pivot_df[indicator_mapping['REC']],
    pivot_df[indicator_mapping['STL']]
)
new_indicators.append({'WSKAZNIK_INDEX': 1003, 'WSKAZNIK': 'Quick Ratio (C+REC)/STL', 'MinMax': 'Max'})
print("✓ 1003: Quick Ratio")

# 1004. Cash Ratio = C/STL
pivot_df['indicator_1004'] = safe_divide(pivot_df[indicator_mapping['C']], pivot_df[indicator_mapping['STL']])
new_indicators.append({'WSKAZNIK_INDEX': 1004, 'WSKAZNIK': 'Cash Ratio (C/STL)', 'MinMax': 'Max'})
print("✓ 1004: Cash Ratio")

# 1005. Short Debt Share = STL/(STL+LTL)
pivot_df['indicator_1005'] = safe_divide(
    pivot_df[indicator_mapping['STL']],
    pivot_df[indicator_mapping['STL']] + pivot_df[indicator_mapping['LTL']]
)
new_indicators.append({'WSKAZNIK_INDEX': 1005, 'WSKAZNIK': 'Short Debt Share (STL/(STL+LTL))', 'MinMax': 'Min'})
print("✓ 1005: Short Debt Share")

# 1006. Long-term Debt Share = LTL/(STL+LTL)
pivot_df['indicator_1006'] = safe_divide(
    pivot_df[indicator_mapping['LTL']],
    pivot_df[indicator_mapping['STL']] + pivot_df[indicator_mapping['LTL']]
)
new_indicators.append({'WSKAZNIK_INDEX': 1006, 'WSKAZNIK': 'Long-term Debt Share (LTL/(STL+LTL))', 'MinMax': 'Max'})
print("✓ 1006: Long-term Debt Share")

# 1007. Interest Coverage = OP/IP
pivot_df['indicator_1007'] = safe_divide(pivot_df[indicator_mapping['OP']], pivot_df[indicator_mapping['IP']])
new_indicators.append({'WSKAZNIK_INDEX': 1007, 'WSKAZNIK': 'Interest Coverage (OP/IP)', 'MinMax': get_minmax('OP')})
print("✓ 1007: Interest Coverage")

# 1008. Financial Risk Ratio = OFE/OP
pivot_df['indicator_1008'] = safe_divide(pivot_df[indicator_mapping['OFE']], pivot_df[indicator_mapping['OP']])
new_indicators.append({'WSKAZNIK_INDEX': 1008, 'WSKAZNIK': 'Financial Risk Ratio (OFE/OP)', 'MinMax': 'Min'})
print("✓ 1008: Financial Risk Ratio")

# 1009. Cash Flow Margin = CF/PNPM
pivot_df['indicator_1009'] = safe_divide(pivot_df[indicator_mapping['CF']], pivot_df[indicator_mapping['PNPM']])
new_indicators.append({'WSKAZNIK_INDEX': 1009, 'WSKAZNIK': 'Cash Flow Margin (CF/PNPM)', 'MinMax': get_minmax('CF')})
print("✓ 1009: Cash Flow Margin")

# 1010. Operating Cash Coverage = (OP+DEPR)/(STL+LTL)
pivot_df['indicator_1010'] = safe_divide(
    pivot_df[indicator_mapping['OP']] + pivot_df[indicator_mapping['DEPR']],
    pivot_df[indicator_mapping['STL']] + pivot_df[indicator_mapping['LTL']]
)
new_indicators.append({'WSKAZNIK_INDEX': 1010, 'WSKAZNIK': 'Operating Cash Coverage ((OP+DEPR)/(STL+LTL))', 'MinMax': 'Max'})
print("✓ 1010: Operating Cash Coverage")

# 1011. Bankruptcy Rate = Upadłość/EN
if 'UPADLOSC' in indicator_mapping and 'EN' in indicator_mapping:
    pivot_df['indicator_1011'] = safe_divide(pivot_df[indicator_mapping['UPADLOSC']], pivot_df[indicator_mapping['EN']])
    new_indicators.append({'WSKAZNIK_INDEX': 1011, 'WSKAZNIK': 'Bankruptcy Rate (Upadłość/EN)', 'MinMax': 'Min'})
    print("✓ 1011: Bankruptcy Rate")

# 1012. Closure Rate = Zamknięte/EN
if 'ZAMKNIETE' in indicator_mapping and 'EN' in indicator_mapping:
    pivot_df['indicator_1012'] = safe_divide(pivot_df[indicator_mapping['ZAMKNIETE']], pivot_df[indicator_mapping['EN']])
    new_indicators.append({'WSKAZNIK_INDEX': 1012, 'WSKAZNIK': 'Closure Rate (Zamknięte/EN)', 'MinMax': 'Min'})
    print("✓ 1012: Closure Rate")

# 1013. Profit Firms Share = PEN/EN
pivot_df['indicator_1013'] = safe_divide(pivot_df[indicator_mapping['PEN']], pivot_df[indicator_mapping['EN']])
new_indicators.append({'WSKAZNIK_INDEX': 1013, 'WSKAZNIK': 'Profit Firms Share (PEN/EN)', 'MinMax': 'Max'})
print("✓ 1013: Profit Firms Share")

# =============================================================================
# ZESTAW 2: EFEKTYWNOŚĆ OPERACYJNA I RENTOWNOŚĆ (indices 1020-1039)
# =============================================================================

# 1020. Sales Profitability = POS/PNPM
pivot_df['indicator_1020'] = safe_divide(pivot_df[indicator_mapping['POS']], pivot_df[indicator_mapping['PNPM']])
new_indicators.append({'WSKAZNIK_INDEX': 1020, 'WSKAZNIK': 'Sales Profitability (POS/PNPM)', 'MinMax': get_minmax('POS')})
print("✓ 1020: Sales Profitability")

# 1021. Core Revenue Share = GS(I)/GS
if 'GS_I' in indicator_mapping and 'GS' in indicator_mapping:
    pivot_df['indicator_1021'] = safe_divide(pivot_df[indicator_mapping['GS_I']], pivot_df[indicator_mapping['GS']])
    new_indicators.append({'WSKAZNIK_INDEX': 1021, 'WSKAZNIK': 'Core Revenue Share (GS(I)/GS)', 'MinMax': 'Max'})
    print("✓ 1021: Core Revenue Share")

# 1022. Cost Share Ratio = TC/PNPM
pivot_df['indicator_1022'] = safe_divide(pivot_df[indicator_mapping['TC']], pivot_df[indicator_mapping['PNPM']])
new_indicators.append({'WSKAZNIK_INDEX': 1022, 'WSKAZNIK': 'Cost Share Ratio (TC/PNPM)', 'MinMax': 'Min'})
print("✓ 1022: Cost Share Ratio")

# 1023. Receivables Turnover = PNPM/REC
pivot_df['indicator_1023'] = safe_divide(pivot_df[indicator_mapping['PNPM']], pivot_df[indicator_mapping['REC']])
new_indicators.append({'WSKAZNIK_INDEX': 1023, 'WSKAZNIK': 'Receivables Turnover (PNPM/REC)', 'MinMax': 'Max'})
print("✓ 1023: Receivables Turnover")

# 1024. Inventory Turnover = TC/INV
pivot_df['indicator_1024'] = safe_divide(pivot_df[indicator_mapping['TC']], pivot_df[indicator_mapping['INV']])
new_indicators.append({'WSKAZNIK_INDEX': 1024, 'WSKAZNIK': 'Inventory Turnover (TC/INV)', 'MinMax': 'Max'})
print("✓ 1024: Inventory Turnover")

# 1025. Current Asset Turnover = PNPM/(C+REC+INV)
pivot_df['indicator_1025'] = safe_divide(
    pivot_df[indicator_mapping['PNPM']],
    pivot_df[indicator_mapping['C']] + pivot_df[indicator_mapping['REC']] + pivot_df[indicator_mapping['INV']]
)
new_indicators.append({'WSKAZNIK_INDEX': 1025, 'WSKAZNIK': 'Current Asset Turnover (PNPM/(C+REC+INV))', 'MinMax': 'Max'})
print("✓ 1025: Current Asset Turnover")

# 1026. Investment Ratio = IO/PNPM
pivot_df['indicator_1026'] = safe_divide(pivot_df[indicator_mapping['IO']], pivot_df[indicator_mapping['PNPM']])
new_indicators.append({'WSKAZNIK_INDEX': 1026, 'WSKAZNIK': 'Investment Ratio (IO/PNPM)', 'MinMax': 'Max'})
print("✓ 1026: Investment Ratio")

# 1027. Financial Revenue Share = Przych.fin./GS
if 'PRZYCHFIN' in indicator_mapping and 'GS' in indicator_mapping:
    pivot_df['indicator_1027'] = safe_divide(pivot_df[indicator_mapping['PRZYCHFIN']], pivot_df[indicator_mapping['GS']])
    new_indicators.append({'WSKAZNIK_INDEX': 1027, 'WSKAZNIK': 'Financial Revenue Share (Przych.fin./GS)', 'MinMax': 'Max'})
    print("✓ 1027: Financial Revenue Share")

# 1028. Net Firm Growth Rate = (Zarejestrowane - Zamknięte)/EN
if 'ZAREJESTROWANE' in indicator_mapping and 'ZAMKNIETE' in indicator_mapping and 'EN' in indicator_mapping:
    pivot_df['indicator_1028'] = safe_divide(
        pivot_df[indicator_mapping['ZAREJESTROWANE']] - pivot_df[indicator_mapping['ZAMKNIETE']],
        pivot_df[indicator_mapping['EN']]
    )
    new_indicators.append({'WSKAZNIK_INDEX': 1028, 'WSKAZNIK': 'Net Firm Growth Rate ((Zarejestrowane-Zamknięte)/EN)', 'MinMax': 'Max'})
    print("✓ 1028: Net Firm Growth Rate")

# 1029. Average Firm Size = Pracujący/EN
if 'PRACUJACY' in indicator_mapping and 'EN' in indicator_mapping:
    pivot_df['indicator_1029'] = safe_divide(pivot_df[indicator_mapping['PRACUJACY']], pivot_df[indicator_mapping['EN']])
    new_indicators.append({'WSKAZNIK_INDEX': 1029, 'WSKAZNIK': 'Average Firm Size (Pracujący/EN)', 'MinMax': 'Max'})
    print("✓ 1029: Average Firm Size")

# =============================================================================
# ZESTAW 3: ROZWÓJ BRANŻY, STABILNOŚĆ I POTENCJAŁ WZROSTU (indices 1040-1059)
# =============================================================================

# 1040. Amortization Ratio = DEPR/PNPM
pivot_df['indicator_1040'] = safe_divide(pivot_df[indicator_mapping['DEPR']], pivot_df[indicator_mapping['PNPM']])
new_indicators.append({'WSKAZNIK_INDEX': 1040, 'WSKAZNIK': 'Amortization Ratio (DEPR/PNPM)', 'MinMax': 'Min'})
print("✓ 1040: Amortization Ratio")

# 1041. New Firms Rate = Nowe/EN
if 'NOWE' in indicator_mapping and 'EN' in indicator_mapping:
    pivot_df['indicator_1041'] = safe_divide(pivot_df[indicator_mapping['NOWE']], pivot_df[indicator_mapping['EN']])
    new_indicators.append({'WSKAZNIK_INDEX': 1041, 'WSKAZNIK': 'New Firms Rate (Nowe/EN)', 'MinMax': 'Max'})
    print("✓ 1041: New Firms Rate")

# 1042. Suspension Rate = Zawieszone/EN
if 'ZAWIESZONE' in indicator_mapping and 'EN' in indicator_mapping:
    pivot_df['indicator_1042'] = safe_divide(pivot_df[indicator_mapping['ZAWIESZONE']], pivot_df[indicator_mapping['EN']])
    new_indicators.append({'WSKAZNIK_INDEX': 1042, 'WSKAZNIK': 'Suspension Rate (Zawieszone/EN)', 'MinMax': 'Min'})
    print("✓ 1042: Suspension Rate")

# 1043. Bank Loans Ratio = (STC+LTC)/(STL+LTL)
pivot_df['indicator_1043'] = safe_divide(
    pivot_df[indicator_mapping['STC']] + pivot_df[indicator_mapping['LTC']],
    pivot_df[indicator_mapping['STL']] + pivot_df[indicator_mapping['LTL']]
)
new_indicators.append({'WSKAZNIK_INDEX': 1043, 'WSKAZNIK': 'Bank Loans Ratio ((STC+LTC)/(STL+LTL))', 'MinMax': 'Max'})
print("✓ 1043: Bank Loans Ratio")

print("\n" + "="*80)
print(f"CALCULATED {len(new_indicators)} NEW INDICATORS")
print("="*80)
print("Division by zero handled: inf/nan values replaced with 0")

# Convert back to long format
new_indicators_data = []

for new_ind in new_indicators:
    indicator_idx = new_ind['WSKAZNIK_INDEX']
    col_name = f'indicator_{indicator_idx}'
    if col_name in pivot_df.columns:
        temp_df = pivot_df[['rok', 'PKD_INDEX', col_name]].copy()
        temp_df.columns = ['rok', 'PKD_INDEX', 'wartosc']
        temp_df['WSKAZNIK_INDEX'] = indicator_idx
        new_indicators_data.append(temp_df)

# Combine all new indicators
new_indicators_df = pd.concat(new_indicators_data, ignore_index=True)

# Select only the columns we need
new_indicators_df = new_indicators_df[['rok', 'wartosc', 'WSKAZNIK_INDEX', 'PKD_INDEX']]

# Combine with original data (now imputed)
combined_values_df = pd.concat([values_df[['rok', 'wartosc', 'WSKAZNIK_INDEX', 'PKD_INDEX']], 
                                new_indicators_df], ignore_index=True)

# Update the wskaznik dictionary
new_wskaznik_dict = pd.DataFrame(new_indicators)
combined_wskaznik_dict = pd.concat([wskaznik_dict_df, new_wskaznik_dict], ignore_index=True)

# Display summary
print(f"\nOriginal values_df shape: {len(values_df)}")
print(f"New indicators added: {len(new_indicators_df)}")
print(f"Combined values_df shape: {combined_values_df.shape}")
print(f"\nOriginal wskaznik dictionary entries: {len(wskaznik_dict_df)}")
print(f"New wskaznik dictionary entries: {len(combined_wskaznik_dict)}")
print("\n" + "="*80)
print("NEW INDICATORS ADDED:")
print("="*80)
for idx, row in new_wskaznik_dict.iterrows():
    minmax_status = row.get('MinMax', 'N/A')
    print(f"Index {row['WSKAZNIK_INDEX']}: {row['WSKAZNIK']} (MinMax: {minmax_status})")

# Show sample of new indicators
print("\n" + "="*80)
print("SAMPLE OF CALCULATED VALUES (first 10 rows of indicator 1000 - Marża netto):")
print("="*80)
sample = new_indicators_df[new_indicators_df['WSKAZNIK_INDEX'] == 1000].head(10)
print(sample.to_string(index=False))

# Check for any remaining NaN or inf values in new indicators
print("\n" + "="*80)
print("DATA QUALITY CHECK:")
print("="*80)
print(f"NaN values in new indicators: {new_indicators_df['wartosc'].isna().sum()}")
print(f"Inf values in new indicators: {np.isinf(new_indicators_df['wartosc']).sum()}")
print(f"NaN values in combined data: {combined_values_df['wartosc'].isna().sum()}")
print(f"Inf values in combined data: {np.isinf(combined_values_df['wartosc']).sum()}")

# Verify zero values (legitimate) vs NaN/Inf (problems)
print(f"\nZero values in new indicators: {(new_indicators_df['wartosc'] == 0).sum()}")
print("Note: Zero values are expected when division by zero occurs (e.g., no revenue)")

Indicator mapping created:
  C: 0
  CF: 1
  DEPR: 2
  EN: 3
  GS: 5
  INV: 6
  IO: 7
  IP: 8
  LTC: 9
  LTL: 10
  NOWE: 14
  NP: 15
  NWC: 16
  OFE: 17
  OP: 18
  PEN: 21
  PNPM: 22
  POS: 23
  PPO: 24
  PRACUJACY: 29
  PRZYCHFIN: 30
  REC: 31
  STC: 32
  STL: 33
  TC: 34
  UPADLOSC: 35
  ZAMKNIETE: 12
  ZAREJESTROWANE: 13
  ZAWIESZONE: 11

Missing values before imputation:
Total missing values: 0
Percentage: 0.00%

Step 1: Interpolating within same indicator and PKD group...


  matching_rows = wskaznik_dict_df[wskaznik_dict_df['WSKAZNIK'].str.contains(search_pattern, case=False, na=False)]


Missing values after interpolation: 0
Step 2: Filling remaining gaps with median by indicator and year...
Missing values after median imputation: 0

IMPUTATION COMPLETE

CALCULATING NEW INDICATORS WITH SAFE DIVISION
✓ 1000: Net Profit Margin
✓ 1001: Operating Margin
✓ 1002: Current Ratio
✓ 1003: Quick Ratio
✓ 1004: Cash Ratio
✓ 1005: Short Debt Share
✓ 1006: Long-term Debt Share
✓ 1007: Interest Coverage
✓ 1008: Financial Risk Ratio
✓ 1009: Cash Flow Margin
✓ 1010: Operating Cash Coverage
✓ 1011: Bankruptcy Rate
✓ 1012: Closure Rate
✓ 1013: Profit Firms Share
✓ 1020: Sales Profitability
✓ 1022: Cost Share Ratio
✓ 1023: Receivables Turnover
✓ 1024: Inventory Turnover
✓ 1025: Current Asset Turnover
✓ 1026: Investment Ratio
✓ 1027: Financial Revenue Share
✓ 1028: Net Firm Growth Rate
✓ 1029: Average Firm Size
✓ 1040: Amortization Ratio
✓ 1041: New Firms Rate
✓ 1042: Suspension Rate
✓ 1043: Bank Loans Ratio

CALCULATED 27 NEW INDICATORS
Division by zero handled: inf/nan values replaced wit

In [12]:
# Save the combined data back to the original files

# Save combined values (with new indicators) to kpi-value-table.csv
combined_values_df.to_csv("../results-pipeline/kpi-value-table.csv", sep=";", index=False, encoding="utf-8")
print(f"✓ Saved combined values to: ../results-pipeline/kpi-value-table.csv")
print(f"  Total rows: {len(combined_values_df)}")

# Save combined wskaznik dictionary to wskaznik_dictionary.csv
combined_wskaznik_dict.to_csv("../results-pipeline/wskaznik_dictionary.csv", sep=";", index=False, encoding="utf-8")
print(f"✓ Saved combined dictionary to: ../results-pipeline/wskaznik_dictionary.csv")
print(f"  Total indicators: {len(combined_wskaznik_dict)}")

print("\n" + "="*80)
print("FILES SUCCESSFULLY UPDATED!")
print("="*80)
print(f"Total new indicators added: {len(new_indicators)}")
print(f"Indicator range: {min([i['WSKAZNIK_INDEX'] for i in new_indicators])}-{max([i['WSKAZNIK_INDEX'] for i in new_indicators])}")
print("\nSummary by set:")
print("  • Zestaw 1 (Zdolność kredytowa): 1000-1013 (14 indicators)")
print("  • Zestaw 2 (Efektywność operacyjna): 1020-1029 (10 indicators)")
print("  • Zestaw 3 (Rozwój branży): 1040-1043 (4 indicators)")

✓ Saved combined values to: ../results-pipeline/kpi-value-table.csv
  Total rows: 1425320
✓ Saved combined dictionary to: ../results-pipeline/wskaznik_dictionary.csv
  Total indicators: 80

FILES SUCCESSFULLY UPDATED!
Total new indicators added: 27
Indicator range: 1000-1043

Summary by set:
  • Zestaw 1 (Zdolność kredytowa): 1000-1013 (14 indicators)
  • Zestaw 2 (Efektywność operacyjna): 1020-1029 (10 indicators)
  • Zestaw 3 (Rozwój branży): 1040-1043 (4 indicators)
