In [1]:
import pandas as pd
import os
import re

# Configuration
DATA_DIR = r"..\data"
RESULTS_DIR = r"..\results-pipeline"
MAPPING_FILE = os.path.join(DATA_DIR, "mapowanie_pkd.xlsx")
BANKRUPTCY_FILE = os.path.join(DATA_DIR, "krz_pkd.csv")
FINANCIAL_FILE = os.path.join(DATA_DIR, "wsk_fin.csv")

# Ensure results directory exists
os.makedirs(RESULTS_DIR, exist_ok=True)

In [2]:
def load_mappings():
    """Loads and prepares PKD mapping tables."""
    print("Loading mappings...")
    map_07_25 = pd.read_excel(MAPPING_FILE, sheet_name="MAP_PKD_2007_2025")
    pkd_25 = pd.read_excel(MAPPING_FILE, sheet_name="PKD_2025")

    # Add 'OG' (Ogółem) special case
    map_07_25_og = pd.concat([
        map_07_25,
        pd.DataFrame({'symbol_2007': ['OG'], 'symbol_2025': ['OG']})
    ], ignore_index=True)

    pkd_25_og = pd.concat([
        pkd_25,
        pd.DataFrame({'typ': ['OGÓŁEM'], 'symbol': ['OG'], 'nazwa': ['OGÓŁEM']})
    ], ignore_index=True)
    
    return map_07_25, map_07_25_og, pkd_25_og

In [3]:
def process_bankruptcy_data(map_07_25):
    """Processes bankruptcy data (KRZ_PKD)."""
    print("Processing bankruptcy data...")
    df = pd.read_csv(BANKRUPTCY_FILE, sep=";", encoding="utf-8")
    
    # Format PKD: 1234A -> 12.34.A
    df["pkd_formatted"] = (
        df["pkd"]
        .str.strip()
        .str.upper()
        .str.replace(r"^(\d{2})(\d{2})([A-Z])$", r"\1.\2.\3", regex=True)
    )
    
    # Split for mapping
    df["pkd_formatted_no_letter"] = df["pkd_formatted"].str.replace(r"\.[A-Z]$", "", regex=True)
    df["pkd_letter"] = df["pkd_formatted"].str.extract(r"\.([A-Z])$")

    # Map 2007 -> 2025
    df_mapped = df.merge(
        map_07_25[["symbol_2007", "symbol_2025"]],
        how="left",
        left_on="pkd_formatted_no_letter",
        right_on="symbol_2007"
    )
    
    # Check for unmapped
    unmapped = df_mapped[df_mapped["symbol_2025"].isna()]
    if not unmapped.empty:
        print(f"Warning: Unmapped bankruptcy codes: {unmapped['pkd_formatted_no_letter'].unique()}")
        df_mapped = df_mapped.dropna(subset=["symbol_2025"])

    # Construct final PKD 2025 code
    df_mapped["pkd_2025"] = df_mapped["symbol_2025"] + '.' + df_mapped["pkd_letter"]
    
    # Select and rename
    final = df_mapped[["rok", "pkd_2025", "liczba_upadlosci"]].copy()
    final["WSKAZNIK"] = "Upadłość"
    final = final.rename(columns={"liczba_upadlosci": "wartosc"})
    
    return final

In [4]:
def process_financial_data(map_07_25_og):
    """Processes financial indicators data."""
    print("Processing financial data...")
    df = pd.read_csv(FINANCIAL_FILE, sep=";", encoding="utf-8")
    
    # Clean PKD format
    df["PKD_formatted"] = df["PKD"].str.replace(r"^SEK_", "", regex=True).str.rstrip('.')
    df = df.drop(columns=["NAZWA_PKD", "NUMER_NAZWA_PKD", "PKD"])
    
    # Map 2007 -> 2025
    df_mapped = df.join(
        map_07_25_og[["symbol_2007", "symbol_2025"]].set_index("symbol_2007"),
        on="PKD_formatted"
    )
    
    # Check for unmapped
    unmapped = df_mapped[df_mapped["symbol_2025"].isna()]
    if not unmapped.empty:
        raise ValueError(f"Unmapped financial codes: {unmapped['PKD_formatted'].unique()}")
        
    df_mapped = df_mapped.drop(columns=["PKD_formatted"])
    
    # Pivot to long format
    df_melted = df_mapped.melt(
        id_vars=['symbol_2025', 'WSKAZNIK'],
        var_name='rok',
        value_name='wartosc'
    )
    
    df_melted['rok'] = df_melted['rok'].astype(int)
    df_melted['wartosc'] = df_melted['wartosc'].replace('bd', pd.NA)
    
    final = df_melted.rename(columns={'symbol_2025': 'pkd_2025'})
    return final

In [5]:
def normalize_pkd_code(value):
    if pd.isna(value):
        return pd.NA
    code = str(value).strip().upper()
    if code.endswith('.0'):
        code = code[:-2]
    return code

def create_dimensions_and_fact(combined_data, pkd_2025_og):
    """Creates dimension tables and links them to the fact table."""
    print("Creating dimensions...")
    
    # 1. WSKAZNIK Dimension
    unique_wskaznik = combined_data["WSKAZNIK"].dropna().sort_values().unique()
    wskaznik_dict = {value: idx for idx, value in enumerate(unique_wskaznik)}
    
    dim_wskaznik = pd.DataFrame(
        list(wskaznik_dict.items()), columns=["WSKAZNIK", "WSKAZNIK_INDEX"]
    )[["WSKAZNIK_INDEX", "WSKAZNIK"]].sort_values("WSKAZNIK_INDEX")
    
    combined_data["WSKAZNIK_INDEX"] = combined_data["WSKAZNIK"].map(wskaznik_dict)
    combined_data.drop(columns=["WSKAZNIK"], inplace=True)
    
    # 2. PKD Type Dimension
    unique_typ = pkd_2025_og["typ"].dropna().unique()
    typ_dict = {value: idx for idx, value in enumerate(unique_typ)}
    
    dim_pkd_typ = pd.DataFrame(
        list(typ_dict.items()), columns=["typ", "TYP_INDEX"]
    )[["TYP_INDEX", "typ"]].sort_values("TYP_INDEX")
    
    pkd_2025_og["TYP_INDEX"] = pkd_2025_og["typ"].map(typ_dict)
    pkd_2025_og.drop(columns=["typ"], inplace=True)
    
    # 3. PKD Dimension & Linking
    pkd_2025_og["symbol_normalized"] = pkd_2025_og["symbol"].map(normalize_pkd_code)
    combined_data["pkd_2025_normalized"] = combined_data["pkd_2025"].map(normalize_pkd_code)
    
    unique_pkd = pkd_2025_og["symbol_normalized"].dropna().sort_values().unique()
    pkd_dict = {value: idx for idx, value in enumerate(unique_pkd)}
    
    pkd_2025_og["PKD_INDEX"] = pkd_2025_og["symbol_normalized"].map(pkd_dict)
    
    dim_pkd = (
        pkd_2025_og[["PKD_INDEX", "symbol", "nazwa", "TYP_INDEX"]]
        .drop_duplicates(subset=["PKD_INDEX"])
        .sort_values("PKD_INDEX")
        .reset_index(drop=True)
    )
    
    combined_data["PKD_INDEX"] = combined_data["pkd_2025_normalized"].map(pkd_dict)
    
    # Check for unmapped PKD in fact table
    missing_pkd = combined_data[combined_data["PKD_INDEX"].isna()]
    if not missing_pkd.empty:
        print(f"Warning: Unmapped PKD codes in fact table: {missing_pkd['pkd_2025_normalized'].unique()}")
        combined_data = combined_data.dropna(subset=["PKD_INDEX"])
        
    # Cleanup fact table
    combined_data.drop(columns=["pkd_2025", "pkd_2025_normalized"], inplace=True)
    
    return combined_data, dim_wskaznik, dim_pkd, dim_pkd_typ

In [6]:
# Execution Pipeline
try:
    # 1. Load Mappings
    map_07_25, map_07_25_og, pkd_25_og = load_mappings()

    # 2. Process Source Data
    df_bankruptcy = process_bankruptcy_data(map_07_25)
    df_financial = process_financial_data(map_07_25_og)

    # 3. Combine Data
    combined_data = pd.concat([df_financial, df_bankruptcy], ignore_index=True)

    from decimal import Decimal

    def str_to_decimal(val):
        if isinstance(val, str):
            val_clean = val.replace('\xa0', '').replace(',', '.')
            return Decimal(val_clean)
        return val
    combined_data['wartosc'] = combined_data['wartosc'].apply(str_to_decimal)

    # 4. Create Dimensions and Fact Table
    fact_table, dim_wskaznik, dim_pkd, dim_pkd_typ = create_dimensions_and_fact(combined_data, pkd_25_og)

    # 5. Save Results
    print("Saving files...")
    dim_wskaznik.to_csv(os.path.join(RESULTS_DIR, "wskaznik_dictionary.csv"), sep=";", encoding="utf-8", index=False)
    dim_pkd.to_csv(os.path.join(RESULTS_DIR, "pkd_dictionary.csv"), sep=";", encoding="utf-8", index=False)
    dim_pkd_typ.to_csv(os.path.join(RESULTS_DIR, "pkd_typ_dictionary.csv"), sep=";", encoding="utf-8", index=False)
    fact_table.to_csv(os.path.join(RESULTS_DIR, "kpi-value-table.csv"), sep=";", encoding="utf-8", index=False)

    print("All tables saved successfully!")

except Exception as e:
    print(f"An error occurred: {e}")

Loading mappings...
Processing bankruptcy data...
Processing financial data...
Processing bankruptcy data...
Processing financial data...
Creating dimensions...
Creating dimensions...
 '46.21.Z' '46.85.Z' '47.76.Z' '47.79.Z' '52.26.C' '56.11.A' '56.12.A'
 '61.10.Z' '61.90.Z' '62.10.Z' '62.20.Z' '63.10.Z' '64.92.Z' '68.32.Z'
 '69.20.Z' '70.10.Z' '71.12.Z' '79.11.A' '79.11.B' '79.90.C' '81.22.Z'
 '82.99.Z' '86.92.A' '86.93.A' '86.94.A' '86.95.A' '86.96.A' '86.97.A'
 '86.92.B' '86.93.B' '86.94.B' '86.95.B' '86.96.B' '86.97.B' '86.91.D'
 '86.92.D' '86.93.D' '86.94.D' '86.95.D' '86.96.D' '86.97.D' '86.91.E'
 '86.92.E' '86.93.E' '86.94.E' '86.95.E' '86.96.E' '86.97.E' '86.99.E'
 '90.20.Z' '90.31.Z' '93.29.Z' '52.21.Z' '86.91.C' '86.92.C' '86.93.C'
 '86.94.C' '86.95.C' '86.96.C' '86.97.C' '13.10.C' '13.20.D' '56.11.B'
 '56.12.B' '77.40.Z' '13.10.A' '46.11.Z' '73.12.C' '81.23.Z' '13.20.A'
 '52.26.A' '85.33.Z' '73.12.B' '85.31.B']
Saving files...
 '46.21.Z' '46.85.Z' '47.76.Z' '47.79.Z' '52.26.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_data.drop(columns=["pkd_2025", "pkd_2025_normalized"], inplace=True)


All tables saved successfully!
