In [4]:
import os
import re
import pandas as pd
import numpy as np

# KPI value table in long format (columns: rok, wartosc, WSKAZNIK_INDEX, PKD_INDEX).
values_df = pd.read_csv(
    "../results-pipeline/kpi-value-table.csv",
    sep=";",
    encoding="utf-8",
)

# Wskaznik (indicator) dictionary: maps WSKAZNIK_INDEX to its name and MinMax flag.
wskaznik_dict_df = pd.read_csv(
    "../results-pipeline/wskaznik_dictionary.csv",
    sep=";",
    encoding="utf-8",
)

# Quick overview of what was loaded: dimensions first, then a preview of each frame.
print(f"Values DataFrame shape: {values_df.shape}")
print(f"Wskaznik Dictionary shape: {wskaznik_dict_df.shape}")
print("\nFirst few rows of values_df:", values_df.head(), sep="\n")
print("\nFirst few rows of wskaznik_dict_df:", wskaznik_dict_df.head(), sep="\n")

# Last expression of the cell, so it is rendered as the cell's output.
values_df.dtypes

Values DataFrame shape: (486587, 4)
Wskaznik Dictionary shape: (37, 3)

First few rows of values_df:
    rok  wartosc  WSKAZNIK_INDEX  PKD_INDEX
0  2018      6.0              35        3.0
1  2018      4.0              35        7.0
2  2018      2.0              35       15.0
3  2018      1.0              35       24.0
4  2018      1.0              35       34.0

First few rows of wskaznik_dict_df:
   WSKAZNIK_INDEX                                           WSKAZNIK MinMax
0               0                    C Środki pieniężne i pap. wart.    Max
1               1                              CF Nadwyżka finansowa    Max
2               2                                   DEPR Amortyzacja    Min
3               3                  EN Liczba jednostek gospodarczych    Max
4               4  GS (I) Przychody netto ze sprzedaży i zrównane...    Max


rok                 int64
wartosc           float64
WSKAZNIK_INDEX      int64
PKD_INDEX         float64
dtype: object

In [None]:
# Build a mapping from short indicator code (e.g. 'C', 'NP') to WSKAZNIK_INDEX
# by looking up a distinguishing fragment of each indicator's name in the
# wskaznik dictionary.
indicator_mapping = {}

# code -> name fragment searched for in wskaznik_dict_df['WSKAZNIK']
indicator_codes = {
    'C': 'C Środki pieniężne',      # cash and securities
    'CF': 'CF Nadwyżka finansowa',  # financial surplus
    'DEPR': 'DEPR Amortyzacja',     # depreciation
    'EN': 'EN Liczba jednostek',    # number of business entities
    'GS': 'GS Przychody ogółem',    # total revenues
    'INV': 'INV Zapasy',            # inventories
    'IO': 'IO Wartość nakładów',    # value of investment outlays
    'IP': 'IP Odsetki',             # interest payable
    'LTC': 'LTC Długoterminowe kredyty',  # long-term bank loans
    'LTL': 'LTL Zobowiązania długoterminowe',  # long-term liabilities
    'NP': 'NP Wynik finansowy netto',  # net financial result (net profit)
    'NWC': 'NWC Kapitał obrotowy',  # working capital
    'OFE': 'OFE Pozostałe koszty',  # other financial costs
    'OP': 'OP Wynik na działalności operacyjnej',  # operating result
    'PEN': 'PEN Liczba rentownych', # number of profitable business entities
    'PNPM': 'PNPM Przychody netto', # net revenues
    'POS': 'POS Wynik na sprzedaży',# result on sales
    'PPO': 'PPO Pozostałe przychody',  # other operating revenues
    'REC': 'REC Należności',        # short-term receivables
    'STC': 'STC Krótkoterminowe kredyty',  # short-term bank loans
    'STL': 'STL Zobowiązania krótkoterminowe',  # short-term liabilities
    'TC': 'TC Koszty ogółem'        # total costs
}

# Resolve every code; a code that cannot be found is reported and left out of
# the mapping (later lookups of that code will then raise KeyError).
for code, search_pattern in indicator_codes.items():
    # regex=False: the fragments are literal text, so characters such as
    # '(' or '.' in an indicator name must not be interpreted as regex syntax.
    is_match = wskaznik_dict_df['WSKAZNIK'].str.contains(
        search_pattern, case=False, na=False, regex=False
    )
    matching_rows = wskaznik_dict_df[is_match]
    if matching_rows.empty:
        print(f"WARNING: Could not find indicator for '{code}' (searched for: '{search_pattern}')")
        continue
    if len(matching_rows) > 1:
        # The first match is still used (as before), but the ambiguity is
        # surfaced instead of being silently ignored.
        print(f"WARNING: Pattern '{search_pattern}' for '{code}' matched "
              f"{len(matching_rows)} dictionary rows; using the first one")
    indicator_mapping[code] = matching_rows.iloc[0]['WSKAZNIK_INDEX']

print("Indicator mapping created:")
for code, idx in sorted(indicator_mapping.items()):
    print(f"  {code}: {idx}")

# Report how much of 'wartosc' is missing before any imputation.
missing_before = values_df['wartosc'].isna().sum()
print("\nMissing values before imputation:")
print(f"Total missing values: {missing_before}")
print(f"Percentage: {missing_before / len(values_df) * 100:.2f}%\n")

# Work on a copy so the loaded frame is untouched until imputation is finished.
values_imputed = values_df.copy()

# Step 1: Interpolate missing values within the same WSKAZNIK_INDEX and PKD_INDEX
# group. Rows are sorted by year first so interpolation follows time order.
print("Step 1: Interpolating within same indicator and PKD group...")
values_imputed = values_imputed.sort_values(['WSKAZNIK_INDEX', 'PKD_INDEX', 'rok'])

interpolated = values_imputed.groupby(['WSKAZNIK_INDEX', 'PKD_INDEX'])['wartosc'].transform(
    lambda group: group.interpolate(method='linear', limit_direction='both')
)
# Use the interpolated series only to fill gaps instead of overwriting the
# column. NOTE(review): PKD_INDEX is float64, which suggests it may contain
# NaN; rows with a NaN group key are excluded from the groups and recent
# pandas returns NaN for them from transform(), which would otherwise erase
# observed values. Confirm against the data.
values_imputed['wartosc'] = values_imputed['wartosc'].fillna(interpolated)

print(f"Missing values after interpolation: {values_imputed['wartosc'].isna().sum()}")

# Step 2: Fill remaining missing values with the median for that WSKAZNIK_INDEX
# in that year (computed across all PKD groups).
print("Step 2: Filling remaining gaps with median by indicator and year...")

median_by_indicator_year = values_imputed.groupby(['WSKAZNIK_INDEX', 'rok'])['wartosc'].transform('median')
values_imputed['wartosc'] = values_imputed['wartosc'].fillna(median_by_indicator_year)

print(f"Missing values after median imputation: {values_imputed['wartosc'].isna().sum()}")

# Step 3: If anything is still NaN (an indicator with no values at all in a
# given year), fall back to the overall median of that indicator.
if values_imputed['wartosc'].isna().sum() > 0:
    print("Step 3: Filling final gaps with overall indicator median...")
    median_by_indicator = values_imputed.groupby('WSKAZNIK_INDEX')['wartosc'].transform('median')
    values_imputed['wartosc'] = values_imputed['wartosc'].fillna(median_by_indicator)
    print(f"Missing values after overall median: {values_imputed['wartosc'].isna().sum()}")

# Step 4: As a last resort (an indicator with no values in any year), fill with 0.
if values_imputed['wartosc'].isna().sum() > 0:
    print("Step 4: Filling any final remaining gaps with 0...")
    values_imputed['wartosc'] = values_imputed['wartosc'].fillna(0)
    print(f"Final missing values: {values_imputed['wartosc'].isna().sum()}")

print("\n" + "="*80)
print("IMPUTATION COMPLETE")
print("="*80)

# Downstream code reads values_df, so rebind it to the imputed data.
values_df = values_imputed.copy()

# Reshape to wide format: one row per (rok, PKD_INDEX) pair and one column per
# WSKAZNIK_INDEX, so that ratios can be computed column against column.
pivot_df = pd.pivot_table(
    values_df,
    values='wartosc',
    index=['rok', 'PKD_INDEX'],
    columns='WSKAZNIK_INDEX',
    aggfunc='mean',  # duplicate (rok, PKD_INDEX, WSKAZNIK_INDEX) rows are averaged
).reset_index()

# Dictionary rows for the 8 derived indicators (indices 1000 and up) are collected here.
new_indicators = list()

# Division helper that never leaves inf or NaN in its result
def safe_divide(numerator, denominator, fill_value=0):
    """
    Divide ``numerator`` by ``denominator`` and substitute ``fill_value``
    wherever the quotient is +inf, -inf or NaN.

    The quotient must support ``.replace`` and ``.fillna`` (e.g. a pandas
    Series). NaN quotients include 0/0 as well as any NaN operand.
    """
    infinities = [np.inf, -np.inf]
    return (numerator / denominator).replace(infinities, fill_value).fillna(fill_value)

# Section banner for the derived-indicator calculations (blank line, rule, title, rule).
print("\n" + "="*80, "CALCULATING NEW INDICATORS WITH SAFE DIVISION", "="*80, sep="\n")

# Looks up the MinMax flag of a base indicator in the wskaznik dictionary
def get_minmax(indicator_code):
    """
    Return the 'MinMax' entry of the dictionary row belonging to ``indicator_code``.

    The code is resolved to a WSKAZNIK_INDEX through ``indicator_mapping``.
    'Max' is returned when the code is not in the mapping, when no dictionary
    row has that index, or when the row has no 'MinMax' field.
    """
    idx = indicator_mapping.get(indicator_code)
    if idx is None:
        return 'Max'  # unknown code -> default

    rows_for_index = wskaznik_dict_df[wskaznik_dict_df['WSKAZNIK_INDEX'] == idx]
    if rows_for_index.empty:
        return 'Max'  # index absent from the dictionary -> default

    return rows_for_index.iloc[0].get('MinMax', 'Max')

# Definitions of the 8 derived ratios (indices 1000-1007).
#   index       - WSKAZNIK_INDEX assigned to the ratio; also the suffix of the
#                 'indicator_<index>' column added to pivot_df
#   label       - short name used in the progress message
#   name        - full name stored in the wskaznik dictionary
#   numerator   - codes whose columns are added together (left to right)
#   denominator - code of the column the sum is divided by
#   minmax_from - code whose dictionary MinMax flag the ratio inherits
# The "-> Max"/"-> Min" remarks state the intended direction of each ratio.
# NOTE(review): the flag actually stored is whatever get_minmax() returns for
# 'minmax_from', so it matches the remark only if the dictionary agrees.
ratio_specs = [
    # Net margin: higher profit per unit of revenue is better -> Max
    {'index': 1000, 'label': 'Marża netto', 'name': 'Marża netto (NP/PNPM)',
     'numerator': ('NP',), 'denominator': 'PNPM', 'minmax_from': 'NP'},
    # Operating margin: higher operating profit per unit of revenue is better -> Max
    {'index': 1001, 'label': 'Marża operacyjna', 'name': 'Marża operacyjna (OP/PNPM)',
     'numerator': ('OP',), 'denominator': 'PNPM', 'minmax_from': 'OP'},
    # Current liquidity ratio: higher ratio means better liquidity -> Max
    {'index': 1002, 'label': 'Wskaźnik bieżącej płynności',
     'name': 'Wskaźnik bieżącej płynności ((C+REC+INV)/STL)',
     'numerator': ('C', 'REC', 'INV'), 'denominator': 'STL', 'minmax_from': 'C'},
    # Quick ratio: higher ratio means better quick liquidity -> Max
    {'index': 1003, 'label': 'Wskaźnik szybki', 'name': 'Wskaźnik szybki ((C+REC)/STL)',
     'numerator': ('C', 'REC'), 'denominator': 'STL', 'minmax_from': 'C'},
    # Debt ratio: lower debt relative to revenue is better -> Min
    {'index': 1004, 'label': 'Wskaźnik zadłużenia', 'name': 'Wskaźnik zadłużenia ((STL+LTL)/PNPM)',
     'numerator': ('STL', 'LTL'), 'denominator': 'PNPM', 'minmax_from': 'STL'},
    # Interest coverage: higher coverage of interest by operating result is better -> Max
    {'index': 1005, 'label': 'Pokrycie odsetek', 'name': 'Pokrycie odsetek (OP/IP)',
     'numerator': ('OP',), 'denominator': 'IP', 'minmax_from': 'OP'},
    # Receivables turnover: higher turnover (faster collection) is better -> Max
    {'index': 1006, 'label': 'Rotacja należności', 'name': 'Rotacja należności (PNPM/REC)',
     'numerator': ('PNPM',), 'denominator': 'REC', 'minmax_from': 'PNPM'},
    # Cash flow margin: higher cash flow per unit of revenue is better -> Max
    {'index': 1007, 'label': 'Cash flow margin', 'name': 'Cash flow margin (CF/PNPM)',
     'numerator': ('CF',), 'denominator': 'PNPM', 'minmax_from': 'CF'},
]

# Fail before touching pivot_df if any required code was not resolved, rather
# than raising a bare KeyError halfway through the calculations.
required_codes = {
    required
    for spec in ratio_specs
    for required in (*spec['numerator'], spec['denominator'])
}
missing_codes = sorted(required_codes - set(indicator_mapping))
if missing_codes:
    raise KeyError(f"Indicator codes missing from indicator_mapping: {missing_codes}")

for spec in ratio_specs:
    # Plain '+' (not a NaN-skipping sum) so a missing component makes the whole
    # numerator NaN, which safe_divide then replaces with its fill value.
    numerator = pivot_df[indicator_mapping[spec['numerator'][0]]]
    for component_code in spec['numerator'][1:]:
        numerator = numerator + pivot_df[indicator_mapping[component_code]]

    pivot_df[f"indicator_{spec['index']}"] = safe_divide(
        numerator,
        pivot_df[indicator_mapping[spec['denominator']]]
    )
    new_indicators.append({
        'WSKAZNIK_INDEX': spec['index'],
        'WSKAZNIK': spec['name'],
        'MinMax': get_minmax(spec['minmax_from']),
    })
    print(f"✓ Calculated indicator {spec['index']}: {spec['label']}")

print("\n" + "="*80)
print("SAFE DIVISION APPLIED TO ALL INDICATORS")
print("="*80)
print("Division by zero handled: inf/nan values replaced with 0")

# Indices of the derived indicators whose 'indicator_<index>' columns exist in pivot_df.
derived_indices = list(range(1000, 1008))
value_columns = ['rok', 'wartosc', 'WSKAZNIK_INDEX', 'PKD_INDEX']

# Convert back to long format: one frame per derived indicator, with the ratio
# column renamed to 'wartosc' and tagged with its WSKAZNIK_INDEX.
new_indicators_data = []
for indicator_idx in derived_indices:
    col_name = f'indicator_{indicator_idx}'
    temp_df = pivot_df[['rok', 'PKD_INDEX', col_name]].copy()
    temp_df.columns = ['rok', 'PKD_INDEX', 'wartosc']
    temp_df['WSKAZNIK_INDEX'] = indicator_idx
    new_indicators_data.append(temp_df)

# Stack all derived indicators and put the columns in the value-table order.
new_indicators_df = pd.concat(new_indicators_data, ignore_index=True)[value_columns]

# Combine with the (imputed) base data. The combined tables are later written
# to the same files the inputs were loaded from, so on a repeated run the base
# tables may already contain indicators 1000-1007. Drop those stale rows first
# so the derived indicators are not appended a second time.
is_stale_value = values_df['WSKAZNIK_INDEX'].isin(derived_indices)
combined_values_df = pd.concat(
    [values_df.loc[~is_stale_value, value_columns], new_indicators_df],
    ignore_index=True,
)

# Extend the wskaznik dictionary the same way (stale derived entries removed).
new_wskaznik_dict = pd.DataFrame(new_indicators)
is_stale_entry = wskaznik_dict_df['WSKAZNIK_INDEX'].isin(derived_indices)
combined_wskaznik_dict = pd.concat(
    [wskaznik_dict_df[~is_stale_entry], new_wskaznik_dict],
    ignore_index=True,
)

# Summary of what was added. Labels name exactly what is printed: row counts
# for len(...), a (rows, columns) tuple for .shape, and the combined (not
# "new") size of the dictionary.
print(f"\nOriginal values_df rows: {len(values_df)}")
print(f"New indicator rows added: {len(new_indicators_df)}")
print(f"Combined values_df shape: {combined_values_df.shape}")
print(f"\nOriginal wskaznik dictionary entries: {len(wskaznik_dict_df)}")
print(f"Combined wskaznik dictionary entries: {len(combined_wskaznik_dict)}")
print("\n" + "="*80)
print("NEW INDICATORS ADDED:")
print("="*80)
for _, row in new_wskaznik_dict.iterrows():
    minmax_status = row.get('MinMax', 'N/A')
    print(f"Index {row['WSKAZNIK_INDEX']}: {row['WSKAZNIK']} (MinMax: {minmax_status})")

# Show a sample of the first derived indicator.
print("\n" + "="*80)
print("SAMPLE OF CALCULATED VALUES (first 10 rows of indicator 1000 - Marża netto):")
print("="*80)
sample = new_indicators_df[new_indicators_df['WSKAZNIK_INDEX'] == 1000].head(10)
print(sample.to_string(index=False))

# Data quality check: neither NaN nor inf should remain after safe division.
print("\n" + "="*80)
print("DATA QUALITY CHECK:")
print("="*80)
print(f"NaN values in new indicators: {new_indicators_df['wartosc'].isna().sum()}")
print(f"Inf values in new indicators: {np.isinf(new_indicators_df['wartosc']).sum()}")
print(f"NaN values in combined data: {combined_values_df['wartosc'].isna().sum()}")
print(f"Inf values in combined data: {np.isinf(combined_values_df['wartosc']).sum()}")

# Zeros are not flagged as errors, but they mix true zeros with ratios that
# safe_divide replaced because they were inf or NaN.
print(f"\nZero values in new indicators: {(new_indicators_df['wartosc'] == 0).sum()}")
print("Note: Zero values include ratios that were undefined (division by zero or missing inputs) and were replaced with 0")

Indicator mapping created:
  C: 0
  CF: 1
  DEPR: 2
  EN: 3
  GS: 5
  INV: 6
  IO: 7
  IP: 8
  LTC: 9
  LTL: 10
  NP: 15
  NWC: 16
  OFE: 17
  OP: 18
  PEN: 21
  PNPM: 22
  POS: 23
  PPO: 24
  REC: 31
  STC: 32
  STL: 33
  TC: 34

Missing values before imputation:
Total missing values: 101940
Percentage: 20.95%

Step 1: Interpolating within same indicator and PKD group...
Missing values after interpolation: 39156
Step 2: Filling remaining gaps with median by indicator and year...
Missing values after median imputation: 0

IMPUTATION COMPLETE
Missing values after interpolation: 39156
Step 2: Filling remaining gaps with median by indicator and year...
Missing values after median imputation: 0

IMPUTATION COMPLETE

CALCULATING NEW INDICATORS WITH SAFE DIVISION
✓ Calculated indicator 1000: Marża netto
✓ Calculated indicator 1001: Marża operacyjna
✓ Calculated indicator 1002: Wskaźnik bieżącej płynności
✓ Calculated indicator 1003: Wskaźnik szybki
✓ Calculated indicator 1004: Wskaźnik zadłu

In [6]:
# Persist the combined tables.
# NOTE: both targets are the same paths the load cell reads from, so the input
# files are overwritten in place.

# Value table including the derived indicators -> kpi-value-table.csv
combined_values_df.to_csv(
    "../results-pipeline/kpi-value-table.csv",
    sep=";",
    index=False,
    encoding="utf-8",
)
print("✓ Saved combined values to: ../results-pipeline/kpi-value-table.csv")
print(f"  Total rows: {len(combined_values_df)}")

# Dictionary including the derived indicators -> wskaznik_dictionary.csv
combined_wskaznik_dict.to_csv(
    "../results-pipeline/wskaznik_dictionary.csv",
    sep=";",
    index=False,
    encoding="utf-8",
)
print("✓ Saved combined dictionary to: ../results-pipeline/wskaznik_dictionary.csv")
print(f"  Total indicators: {len(combined_wskaznik_dict)}")

# Closing banner (blank line, rule, title, rule) followed by a one-line recap.
print("\n" + "="*80, "FILES SUCCESSFULLY UPDATED!", "="*80, sep="\n")
print("New indicators (1000-1007) have been added to both files.")

✓ Saved combined values to: ../results-pipeline/kpi-value-table.csv
  Total rows: 661235
✓ Saved combined dictionary to: ../results-pipeline/wskaznik_dictionary.csv
  Total indicators: 45

FILES SUCCESSFULLY UPDATED!
New indicators (1000-1007) have been added to both files.
