In [7]:
import os
import re
import pandas as pd
import numpy as np

# Read the two pipeline artefacts; both are semicolon-delimited UTF-8 CSVs.
# values_df: long-format KPI values (rok, wartosc, WSKAZNIK_INDEX, PKD_INDEX).
values_df = pd.read_csv(
    "../results-pipeline/kpi-value-table.csv",
    encoding="utf-8",
    sep=";",
)

# wskaznik_dict_df: lookup from WSKAZNIK_INDEX to the indicator label (WSKAZNIK).
wskaznik_dict_df = pd.read_csv(
    "../results-pipeline/wskaznik_dictionary.csv",
    encoding="utf-8",
    sep=";",
)

# Sanity report: dimensions first, then a short preview of each frame.
print(f"Values DataFrame shape: {values_df.shape}")
print(f"Wskaznik Dictionary shape: {wskaznik_dict_df.shape}")
print("\nFirst few rows of values_df:", values_df.head(), sep="\n")
print("\nFirst few rows of wskaznik_dict_df:", wskaznik_dict_df.head(), sep="\n")

# Bare last expression so the notebook renders the column dtypes as cell output.
values_df.dtypes

Values DataFrame shape: (629957, 4)
Wskaznik Dictionary shape: (38, 2)

First few rows of values_df:
    rok  wartosc  WSKAZNIK_INDEX  PKD_INDEX
0  2005  1828.70               0        0.0
1  2006  2256.83               0        0.0
2  2007  2027.13               0        0.0
3  2008  1978.48               0        0.0
4  2009  2061.13               0        0.0

First few rows of wskaznik_dict_df:
   WSKAZNIK_INDEX                                           WSKAZNIK
0               0                   C Środki pieniężne i pap. wart. 
1               1                             CF Nadwyżka finansowa 
2               2                                  DEPR Amortyzacja 
3               3                 EN Liczba jednostek gospodarczych 
4               4  GS (I) Przychody netto ze sprzedaży i zrównane...


rok                 int64
wartosc           float64
WSKAZNIK_INDEX      int64
PKD_INDEX         float64
dtype: object

In [8]:
# Maps each short indicator code to its WSKAZNIK_INDEX (the column label it
# gets after pivoting). The indices are hard-coded: the last four were shifted
# by an earlier renumbering (old values noted inline), so re-check them against
# wskaznik_dict_df whenever the dictionary file changes.
indicator_mapping = dict(
    C=0,      # Cash and securities
    CF=1,     # Financial surplus
    DEPR=2,   # Depreciation
    EN=3,     # Number of business entities
    GS=5,     # Total revenues
    INV=6,    # Inventories
    IO=7,     # Value of investment outlays
    IP=8,     # Interest payable
    LTC=9,    # Long-term bank loans
    LTL=10,   # Long-term liabilities
    NP=11,    # Net financial result (net profit)
    NWC=12,   # Working capital
    OFE=13,   # Other financial costs
    OP=14,    # Operating result
    PEN=15,   # Number of profitable business entities
    PNPM=16,  # Net revenues
    POS=17,   # Result on sales
    PPO=18,   # Other operating revenues
    REC=25,   # Short-term receivables (was 20, now 25)
    STC=26,   # Short-term bank loans (was 21, now 26)
    STL=27,   # Short-term liabilities (was 22, now 27)
    TC=28,    # Total costs (was 23, now 28)
)

# Imputation of missing 'wartosc' values, applied as a cascade:
#   1. linear interpolation inside each (WSKAZNIK_INDEX, PKD_INDEX) series,
#   2. median of the same indicator in the same year,
#   3. overall median of the indicator,
#   4. zero as a last resort.
# The result replaces values_df at the end of the block.

# Check missing values before imputation
print("Missing values before imputation:")
print(f"Total missing values: {values_df['wartosc'].isna().sum()}")
# isna().mean() is missing/len, but yields NaN instead of raising on an empty frame.
print(f"Percentage: {values_df['wartosc'].isna().mean() * 100:.2f}%\n")

# Step 1: Interpolate missing values within same WSKAZNIK_INDEX and PKD_INDEX group
print("Step 1: Interpolating within same indicator and PKD group...")
# sort_values returns a new frame, so values_df itself is not modified here.
# Sorting by rok last puts every series in chronological order for interpolation.
values_imputed = values_df.sort_values(['WSKAZNIK_INDEX', 'PKD_INDEX', 'rok'])

# dropna=False: PKD_INDEX is float64 and may therefore contain NaN. With the
# default dropna=True such rows belong to no group, transform returns NaN for
# them, and the assignment would overwrite their real values with NaN (which
# the median steps below would then silently replace).
# limit_direction='both' also fills leading/trailing gaps of a series.
# NOTE(review): method='linear' treats consecutive rows as equally spaced, so
# a year that is absent from the table is not accounted for - confirm the
# series have no missing 'rok' rows.
values_imputed['wartosc'] = (
    values_imputed
    .groupby(['WSKAZNIK_INDEX', 'PKD_INDEX'], dropna=False)['wartosc']
    .transform(lambda series: series.interpolate(method='linear', limit_direction='both'))
)

print(f"Missing values after interpolation: {values_imputed['wartosc'].isna().sum()}")

# Step 2: Fill remaining missing values with median for that WSKAZNIK_INDEX in that year
print("Step 2: Filling remaining gaps with median by indicator and year...")

# transform('median') broadcasts the group median back to every row, so the
# result is aligned with values_imputed and can be passed straight to fillna.
median_by_indicator_year = values_imputed.groupby(['WSKAZNIK_INDEX', 'rok'])['wartosc'].transform('median')
values_imputed['wartosc'] = values_imputed['wartosc'].fillna(median_by_indicator_year)

print(f"Missing values after median imputation: {values_imputed['wartosc'].isna().sum()}")

# Step 3: If still any NaN (edge cases), fill with overall median for that indicator
if values_imputed['wartosc'].isna().sum() > 0:
    print("Step 3: Filling final gaps with overall indicator median...")
    median_by_indicator = values_imputed.groupby('WSKAZNIK_INDEX')['wartosc'].transform('median')
    values_imputed['wartosc'] = values_imputed['wartosc'].fillna(median_by_indicator)
    print(f"Missing values after overall median: {values_imputed['wartosc'].isna().sum()}")

# Step 4: As a last resort, fill any remaining NaN with 0
if values_imputed['wartosc'].isna().sum() > 0:
    print("Step 4: Filling any final remaining gaps with 0...")
    values_imputed['wartosc'] = values_imputed['wartosc'].fillna(0)
    print(f"Final missing values: {values_imputed['wartosc'].isna().sum()}")

print("\n" + "="*80)
print("IMPUTATION COMPLETE")
print("="*80)

# Update values_df with imputed values (rows are now in sorted order)
values_df = values_imputed.copy()

# Reshape to wide format: one row per (rok, PKD_INDEX) pair and one column per
# WSKAZNIK_INDEX, so that ratios can be computed column against column.
# aggfunc='first' picks a single value per cell if a key combination repeats.
pivot_df = pd.pivot_table(
    values_df,
    values='wartosc',
    index=['rok', 'PKD_INDEX'],
    columns='WSKAZNIK_INDEX',
    aggfunc='first',
)
pivot_df = pivot_df.reset_index()

# Dictionary entries ({'WSKAZNIK_INDEX', 'WSKAZNIK'}) for the 8 derived
# indicators, numbered from 1000; filled in as each indicator is calculated.
new_indicators = list()

# Division helper used for every derived ratio
def safe_divide(numerator, denominator, fill_value=0):
    """
    Divide ``numerator`` by ``denominator`` and scrub undefined results.

    Any +inf, -inf or NaN in the quotient (for example from a zero
    denominator or a missing operand) is replaced by ``fill_value``.
    The quotient must be a pandas object, because ``replace`` and
    ``fillna`` are called on it.

    Returns the cleaned quotient.
    """
    infinities = [np.inf, -np.inf]
    quotient = numerator / denominator
    return quotient.replace(infinities, fill_value).fillna(fill_value)

# Derived ratios, one entry per indicator:
#   (new WSKAZNIK_INDEX, dictionary label, numerator codes, denominator code)
# Numerator codes are added left to right before the division.
_ratio_specs = [
    (1000, 'Marża netto (NP/PNPM)', ('NP',), 'PNPM'),                                        # net margin
    (1001, 'Marża operacyjna (OP/PNPM)', ('OP',), 'PNPM'),                                   # operating margin
    (1002, 'Wskaźnik bieżącej płynności ((C+REC+INV)/STL)', ('C', 'REC', 'INV'), 'STL'),     # current liquidity ratio
    (1003, 'Wskaźnik szybki ((C+REC)/STL)', ('C', 'REC'), 'STL'),                            # quick ratio
    (1004, 'Wskaźnik zadłużenia ((STL+LTL)/PNPM)', ('STL', 'LTL'), 'PNPM'),                  # debt ratio
    (1005, 'Pokrycie odsetek (OP/IP)', ('OP',), 'IP'),                                       # interest coverage
    (1006, 'Rotacja należności (PNPM/REC)', ('PNPM',), 'REC'),                               # receivables turnover
    (1007, 'Cash flow margin (CF/PNPM)', ('CF',), 'PNPM'),                                   # cash flow margin
]


def _column_sum(codes):
    """Add up the pivot_df columns of the given indicator codes, left to right."""
    total = pivot_df[indicator_mapping[codes[0]]]
    for code in codes[1:]:
        total = total + pivot_df[indicator_mapping[code]]
    return total


# Each ratio becomes a new 'indicator_<index>' column of pivot_df and gets a
# matching dictionary entry; undefined results are replaced by safe_divide.
for _index, _label, _numerator_codes, _denominator_code in _ratio_specs:
    pivot_df[f'indicator_{_index}'] = safe_divide(
        _column_sum(_numerator_codes),
        pivot_df[indicator_mapping[_denominator_code]],
    )
    new_indicators.append({'WSKAZNIK_INDEX': _index, 'WSKAZNIK': _label})

print("\n" + "="*80, "SAFE DIVISION APPLIED", "="*80, sep="\n")
print("Division by zero handled: inf/nan values replaced with 0")

# Convert the derived indicator columns back to long format and merge them
# with the (imputed) base values and the indicator dictionary.

# Column order of the long-format value table.
long_columns = ['rok', 'wartosc', 'WSKAZNIK_INDEX', 'PKD_INDEX']

# Indices of the derived indicators registered in new_indicators (1000-1007).
derived_indices = [entry['WSKAZNIK_INDEX'] for entry in new_indicators]

new_indicators_data = []

for indicator_idx in derived_indices:
    col_name = f'indicator_{indicator_idx}'
    temp_df = pivot_df[['rok', 'PKD_INDEX', col_name]].copy()
    temp_df.columns = ['rok', 'PKD_INDEX', 'wartosc']
    temp_df['WSKAZNIK_INDEX'] = indicator_idx
    new_indicators_data.append(temp_df)

# Combine all new indicators and keep only the long-format columns
new_indicators_df = pd.concat(new_indicators_data, ignore_index=True)[long_columns]

# The save cell below overwrites the very CSV files this notebook loads, so on
# a second run values_df and wskaznik_dict_df already contain indicators
# 1000-1007. Drop those stale rows first; otherwise every re-run would append
# another copy of the derived values and dictionary entries.
is_stale_value = values_df['WSKAZNIK_INDEX'].isin(derived_indices)
is_stale_entry = wskaznik_dict_df['WSKAZNIK_INDEX'].isin(derived_indices)
if is_stale_value.any() or is_stale_entry.any():
    print(
        f"Replacing {int(is_stale_value.sum())} previously saved derived value rows "
        f"and {int(is_stale_entry.sum())} dictionary entries."
    )

# Combine with original data (now imputed)
combined_values_df = pd.concat(
    [values_df.loc[~is_stale_value, long_columns], new_indicators_df],
    ignore_index=True,
)

# Update the wskaznik dictionary
new_wskaznik_dict = pd.DataFrame(new_indicators)
combined_wskaznik_dict = pd.concat(
    [wskaznik_dict_df[~is_stale_entry], new_wskaznik_dict],
    ignore_index=True,
)

# Row/entry counts before and after adding the derived indicators.
print(
    f"\nOriginal values_df shape: {len(values_df)}",
    f"New indicators added: {len(new_indicators_df)}",
    f"Combined values_df shape: {combined_values_df.shape}",
    f"\nOriginal wskaznik dictionary entries: {len(wskaznik_dict_df)}",
    f"New wskaznik dictionary entries: {len(combined_wskaznik_dict)}",
    sep="\n",
)

# List the index and label of every derived indicator.
print("\n" + "="*80, "NEW INDICATORS ADDED:", "="*80, sep="\n")
for index_value, label in zip(new_wskaznik_dict['WSKAZNIK_INDEX'], new_wskaznik_dict['WSKAZNIK']):
    print(f"Index {index_value}: {label}")

# Preview: first ten long-format rows of indicator 1000.
print(
    "\n" + "="*80,
    "SAMPLE OF CALCULATED VALUES (first 10 rows of indicator 1000 - Marża netto):",
    "="*80,
    sep="\n",
)
is_net_margin = new_indicators_df['WSKAZNIK_INDEX'] == 1000
sample = new_indicators_df[is_net_margin].head(10)
print(sample.to_string(index=False))

# Count NaN/inf left in the derived values and in the combined table.
print("\n" + "="*80, "DATA QUALITY CHECK:", "="*80, sep="\n")
new_values = new_indicators_df['wartosc']
combined_values = combined_values_df['wartosc']
print(f"NaN values in new indicators: {new_values.isna().sum()}")
print(f"Inf values in new indicators: {np.isinf(new_values).sum()}")
print(f"NaN values in combined data: {combined_values.isna().sum()}")
print(f"Inf values in combined data: {np.isinf(combined_values).sum()}")

Missing values before imputation:
Total missing values: 0
Percentage: 0.00%

Step 1: Interpolating within same indicator and PKD group...
Missing values after interpolation: 0
Step 2: Filling remaining gaps with median by indicator and year...
Missing values after median imputation: 0

IMPUTATION COMPLETE

SAFE DIVISION APPLIED
Division by zero handled: inf/nan values replaced with 0

Original values_df shape: 629957
New indicators added: 173784
Combined values_df shape: (803741, 4)

Original wskaznik dictionary entries: 38
New wskaznik dictionary entries: 46

NEW INDICATORS ADDED:
Index 1000: Marża netto (NP/PNPM)
Index 1001: Marża operacyjna (OP/PNPM)
Index 1002: Wskaźnik bieżącej płynności ((C+REC+INV)/STL)
Index 1003: Wskaźnik szybki ((C+REC)/STL)
Index 1004: Wskaźnik zadłużenia ((STL+LTL)/PNPM)
Index 1005: Pokrycie odsetek (OP/IP)
Index 1006: Rotacja należności (PNPM/REC)
Index 1007: Cash flow margin (CF/PNPM)

SAMPLE OF CALCULATED VALUES (first 10 rows of indicator 1000 - Marża n

In [9]:
# Persist the combined tables over the original pipeline files, using the
# semicolon-separated UTF-8 layout and omitting the DataFrame index.

# Value table, now including the derived indicator rows.
combined_values_df.to_csv(
    "../results-pipeline/kpi-value-table.csv",
    encoding="utf-8",
    index=False,
    sep=";",
)
print(
    "✓ Saved combined values to: ../results-pipeline/kpi-value-table.csv",
    f"  Total rows: {len(combined_values_df)}",
    sep="\n",
)

# Indicator dictionary, now including the derived indicator labels.
combined_wskaznik_dict.to_csv(
    "../results-pipeline/wskaznik_dictionary.csv",
    encoding="utf-8",
    index=False,
    sep=";",
)
print(
    "✓ Saved combined dictionary to: ../results-pipeline/wskaznik_dictionary.csv",
    f"  Total indicators: {len(combined_wskaznik_dict)}",
    sep="\n",
)

print("\n" + "="*80, "FILES SUCCESSFULLY UPDATED!", "="*80, sep="\n")
print("New indicators (1000-1007) have been added to both files.")

✓ Saved combined values to: ../results-pipeline/kpi-value-table.csv
  Total rows: 803741
✓ Saved combined dictionary to: ../results-pipeline/wskaznik_dictionary.csv
  Total indicators: 46

FILES SUCCESSFULLY UPDATED!
New indicators (1000-1007) have been added to both files.
