## Setup

In [1]:
import sys
import os
from importlib import reload

# Add project directory to path
sys.path.append(os.path.join('.'))

from etl_pipeline_new import ETLPipeline
import data_sources_config

# Reload the config module to pick up changes
reload(data_sources_config)
from data_sources_config import get_data_sources_config

print("✓ Modules loaded successfully")

✓ Modules loaded successfully


## Initialize Pipeline

In [2]:
# Initialize ETL pipeline with PKD mapping file
pipeline = ETLPipeline(
    mapping_file_path=os.path.join('..', '..', 'data', 'mapowanie_pkd.xlsx')
)

print("✓ Pipeline initialized")

✓ Pipeline initialized


## Load and Process Data Sources

All data sources are configured in `data_sources_config.py`

In [3]:
# Get data sources configuration
data_sources = get_data_sources_config()

print(f"Found {len(data_sources)} data sources to process:\n")

# Process each data source
for source_config in data_sources:
    processor_class = source_config['processor_class']
    processor = processor_class(pipeline.pkd_mapper)
    
    pipeline.add_data_source(
        processor=processor,
        name=source_config['name'],
        **source_config['kwargs']
    )

print("\n✓ All data sources processed")

Found 5 data sources to process:

Processing Upadłości (KRZ_PKD)...
  ✓ Loaded 2,538 rows
Processing Wskaźniki Finansowe...
  ✓ Loaded 418,560 rows
Processing Dane Kwartalne - Pracujący...
  ✓ Loaded 40,705 rows
Processing Dane Kwartalne - liczba firm vs działalności gospodarczych...
  ✓ Loaded 24,423 rows
Processing Dane Kwartalne - liczba firm vs działalności gospodarczych...
  ✓ Loaded 7,793 rows

✓ All data sources processed


## Combine Data

In [4]:
# Combine all data sources into one fact table
combined_data = pipeline.combine_data()

print(f"✓ Combined data: {len(combined_data):,} rows")
print(f"  Years: {combined_data['rok'].min()} - {combined_data['rok'].max()}")
print(f"  Unique indicators: {combined_data['WSKAZNIK'].nunique():,}")
print(f"  Unique PKD codes: {combined_data['pkd_2025'].nunique():,}")

combined_data.head(10)

✓ Combined data: 494,019 rows
  Years: 2005 - 2024
  Unique indicators: 34
  Unique PKD codes: 1,489


Unnamed: 0,rok,pkd_2025,WSKAZNIK,wartosc
0,2018,01.11.Z,Upadłość,6
1,2018,01.13.Z,Upadłość,4
2,2018,01.19.Z,Upadłość,2
3,2018,01.24.Z,Upadłość,1
4,2018,01.29.Z,Upadłość,1
5,2018,01.30.Z,Upadłość,1
6,2018,01.41.Z,Upadłość,3
7,2018,01.47.Z,Upadłość,2
8,2018,01.48.Z,Upadłość,1
9,2018,01.50.Z,Upadłość,9


## Build Dimension Tables

In [5]:
# Build dimension tables and map indices
(
    fact_table,
    wskaznik_dictionary,
    pkd_dictionary,
    pkd_typ_dictionary
) = pipeline.build_dictionaries(combined_data)

print("✓ Dimension tables built")

✓ Dimension tables built


## Inspect Results

In [6]:
# Fact table (KPI values)
print("=== FACT TABLE ===")
print(f"Rows: {len(fact_table):,}")
print(f"Columns: {list(fact_table.columns)}")
display(fact_table.head(10))

=== FACT TABLE ===
Rows: 484,257
Columns: ['rok', 'wartosc', 'WSKAZNIK_INDEX', 'PKD_INDEX']


Unnamed: 0,rok,wartosc,WSKAZNIK_INDEX,PKD_INDEX
0,2018,6,33,3.0
1,2018,4,33,7.0
2,2018,2,33,15.0
3,2018,1,33,24.0
4,2018,1,33,34.0
5,2018,1,33,37.0
6,2018,3,33,40.0
7,2018,2,33,52.0
8,2018,1,33,54.0
9,2018,9,33,57.0


In [7]:
# WSKAZNIK dimension
print("=== WSKAZNIK DIMENSION ===")
print(f"Total indicators: {len(wskaznik_dictionary):,}")
display(wskaznik_dictionary.head(20))

=== WSKAZNIK DIMENSION ===
Total indicators: 34


Unnamed: 0,WSKAZNIK_INDEX,WSKAZNIK
0,0,C Środki pieniężne i pap. wart.
1,1,CF Nadwyżka finansowa
2,2,DEPR Amortyzacja
3,3,EN Liczba jednostek gospodarczych
4,4,GS (I) Przychody netto ze sprzedaży i zrównane...
5,5,GS Przychody ogółem
6,6,INV Zapasy
7,7,IO Wartość nakładów inwestycyjnych
8,8,IP Odsetki do zapłacenia
9,9,LTC Długoterminowe kredyty bankowe


In [8]:
# PKD dimension
print("=== PKD DIMENSION ===")
print(f"Total PKD codes: {len(pkd_dictionary):,}")
display(pkd_dictionary.head(20))

=== PKD DIMENSION ===
Total PKD codes: 1,764


Unnamed: 0,PKD_INDEX,symbol,nazwa,TYP_INDEX
0,0,01,"UPRAWY ROLNE, CHÓW I HODOWLA ZWIERZĄT, ŁOWIECT...",1
1,1,01.1,Uprawy rolne inne niż wieloletnie,2
2,2,01.11,"Uprawa zbóż innych niż ryż, roślin strączkowyc...",3
3,3,01.11.Z,"Uprawa zbóż innych niż ryż, roślin strączkowyc...",4
4,4,01.12,Uprawa ryżu,3
5,5,01.12.Z,Uprawa ryżu,4
6,6,01.13,"Uprawa warzyw, włączając melony oraz uprawa ro...",3
7,7,01.13.Z,"Uprawa warzyw, włączając melony oraz uprawa ro...",4
8,8,01.14,Uprawa trzciny cukrowej,3
9,9,01.14.Z,Uprawa trzciny cukrowej,4


In [9]:
# PKD Type dimension
print("=== PKD TYPE DIMENSION ===")
print(f"Total types: {len(pkd_typ_dictionary):,}")
display(pkd_typ_dictionary)

=== PKD TYPE DIMENSION ===
Total types: 6


Unnamed: 0,TYP_INDEX,typ
0,0,SEKCJA
1,1,DZIAŁ
2,2,GRUPA
3,3,KLASA
4,4,PODKLASA
5,5,OGÓŁEM


## Data Quality Checks

In [10]:
# Check for null values
print("=== NULL VALUE ANALYSIS ===")
null_counts = fact_table.isnull().sum()
null_percentages = (null_counts / len(fact_table) * 100).round(2)

for col in fact_table.columns:
    if null_counts[col] > 0:
        print(f"{col}: {null_counts[col]:,} ({null_percentages[col]}%)")
    else:
        print(f"{col}: ✓ No nulls")

=== NULL VALUE ANALYSIS ===
rok: ✓ No nulls
wartosc: 101,940 (21.05%)
WSKAZNIK_INDEX: ✓ No nulls
PKD_INDEX: ✓ No nulls


In [11]:
# Check data types
print("=== DATA TYPES ===")
print(fact_table.dtypes)
print("\nValue types in 'wartosc' column:")
print(fact_table['wartosc'].apply(type).value_counts())

=== DATA TYPES ===
rok                 int64
wartosc            object
WSKAZNIK_INDEX      int64
PKD_INDEX         float64
dtype: object

Value types in 'wartosc' column:
wartosc
<class 'decimal.Decimal'>    320476
<class 'NoneType'>           101940
<class 'int'>                 56163
<class 'float'>                5678
Name: count, dtype: int64


## Save Results

In [12]:
# Save all tables to CSV files
pipeline.save_results(
    fact_table=fact_table,
    wskaznik_dict=wskaznik_dictionary,
    pkd_dict=pkd_dictionary,
    pkd_typ_dict=pkd_typ_dictionary,
    output_dir=os.path.join('..', '..', 'results-pipeline')
)


✓ All tables saved to ..\..\results-pipeline/
  - Fact table: 484,257 rows
  - WSKAZNIK dictionary: 34 indicators
  - PKD dictionary: 1,764 codes
  - PKD type dictionary: 6 types


## Summary

### Output Files

- **kpi-value-table.csv**: Fact table with all KPI values
- **wskaznik_dictionary.csv**: Dimension table for indicators
- **pkd_dictionary.csv**: Dimension table for PKD codes
- **pkd_typ_dictionary.csv**: Dimension table for PKD types

### Schema

**Fact Table:**
- `rok` (int): Year
- `PKD_INDEX` (int): Foreign key to pkd_dictionary
- `WSKAZNIK_INDEX` (int): Foreign key to wskaznik_dictionary
- `wartosc` (Decimal/None): KPI value

**WSKAZNIK Dictionary:**
- `WSKAZNIK_INDEX` (int): Primary key
- `WSKAZNIK` (str): Indicator name

**PKD Dictionary:**
- `PKD_INDEX` (int): Primary key
- `symbol` (str): PKD code
- `nazwa` (str): PKD name
- `TYP_INDEX` (int): Foreign key to pkd_typ_dictionary

**PKD Type Dictionary:**
- `TYP_INDEX` (int): Primary key
- `typ` (str): Type name (Sekcja, Dział, etc.)