In [1]:
import sys
from pathlib import Path

# Search upwards from the current working directory for the first folder that
# contains a ".git" entry; that folder is treated as the repository root.
p = Path.cwd().resolve()
repo_root = None
for candidate in (p, *p.parents):
    if (candidate / ".git").exists():
        repo_root = candidate
        break

if repo_root is None:
    raise RuntimeError("Repo root not found. Open the repo folder in VS Code.")

# Prepend the repo root to the import search path and report what was found.
sys.path.insert(0, str(repo_root))
print("Repo root:", repo_root)

Repo root: C:\Users\harri\OneDrive - Imperial College London\Year 3 Group Project\Group_Project_Y3


In [2]:
import numpy as np
import pandas as pd

**A - National Census 2021 (population):**

Cleaning P4181 - Population by age and sex

In [5]:
# Load the raw P4181 table (semicolon-separated) and preview the first rows.
pop_nc_as = pd.read_csv(
    repo_root / "raw/03_01_population_data/03_01_census_p4181_pop_sex_age.csv",
    sep=";",
)
pop_nc_as.head()

Unnamed: 0,Code,Name,total;total;2021;[person],total;0;2021;[person],total;1;2021;[person],total;2;2021;[person],total;3;2021;[person],total;4;2021;[person],total;5;2021;[person],total;6;2021;[person],...,females;82;2021;[person],females;83;2021;[person],females;84;2021;[person],females;85;2021;[person],females;86;2021;[person],females;87;2021;[person],females;88;2021;[person],females;89;2021;[person],females;90 and more;2021;[person],Unnamed: 278
0,201000,Powiat bolesławiecki,88435,718,799,891,910,949,886,899,...,244,273,215,222,222,173,135,127,419,
1,202000,Powiat dzierżoniowski,97721,658,747,791,886,877,829,873,...,368,391,347,317,258,246,228,205,768,
2,203000,Powiat głogowski,86668,705,764,769,832,848,848,885,...,177,198,193,171,141,117,89,88,304,
3,204000,Powiat górowski,33317,275,325,297,356,307,300,320,...,73,92,87,89,79,63,69,53,175,
4,205000,Powiat jaworski,48503,397,425,413,447,457,460,458,...,126,128,118,126,106,117,82,87,285,


In [12]:
# Reshape the wide P4181 table (one column per sex/age combination) into a long
# table with one row per powiat, sex and age, then save it as CSV.
df = pop_nc_as.copy()

# Drop the trailing "Unnamed: N" column (blank in the preview above) by prefix
# rather than by exact name: N is the column position, so a hard-coded name
# raises KeyError as soon as the raw file gains or loses a column.
df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]

# Melt to long format: every measure column becomes one (measure, count) row
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="count")

# Split the measure header into its ";"-separated parts,
# e.g. "females;82;2021;[person]" -> sex, age, year (the unit part is not kept)
parts = long["measure"].str.split(";", expand=True)
long["sex"] = parts[0].str.strip()
long["age"] = parts[1].str.strip()  # stays text: includes "total" and "90 and more"
long["year"] = pd.to_numeric(parts[2]).astype("Int64")

out = long.rename(
    columns={"Code": "code", "Name": "powiat"}
).loc[:, ["code", "powiat", "year", "sex", "age", "count"]]

# NOTE(review): the default index is written as an unnamed first column;
# pass index=False here if downstream readers do not rely on it.
out.to_csv(repo_root / "cleaned/03_01_outcome_data/pop_nc_sex_age_p4181.csv")

Cleaning P4315 - Population aged 13 years and more by educational level and sex

In [15]:
# Load the raw P4315 table (semicolon-separated) and preview the first rows.
pop_nc_se = pd.read_csv(
    repo_root / "raw/03_01_population_data/03_01_census_p4315_pop_sex_ed.csv",
    sep=";",
)
pop_nc_se.head()

Unnamed: 0,Code,Name,total;total;2021;[person],total;tertiary;2021;[person],total;secondary and post-secondary - total;2021;[person],total;secondary and post-secondary - general secondary;2021;[person],total;secondary and post-secondary - vocational secondary;2021;[person],total;basic vocational/sectoral;2021;[person],total;lower secondary;2021;[person],total;primary completed;2021;[person],...,females;tertiary;2021;[person],females;secondary and post-secondary - total;2021;[person],females;secondary and post-secondary - general secondary;2021;[person],females;secondary and post-secondary - vocational secondary;2021;[person],females;basic vocational/sectoral;2021;[person],females;lower secondary;2021;[person],females;primary completed;2021;[person],females;primary not completed and without school education;2021;[person],females;unknown;2021;[person],Unnamed: 32
0,201000,Powiat bolesławiecki,76739,13420,26886,7994,16114,19575,2207,9772,...,8351,14231,4880,7365,8243,985,5646,1216,1154,
1,202000,Powiat dzierżoniowski,86543,14680,31483,9645,18302,19891,2415,11947,...,9126,17048,5911,8710,8811,1035,6689,1350,1765,
2,203000,Powiat głogowski,75191,15725,27891,8358,16723,17123,2175,7877,...,9655,14637,5201,7429,7037,912,4626,1247,878,
3,204000,Powiat górowski,29042,3962,9693,2563,6148,8087,1091,4556,...,2554,5162,1591,2915,3185,434,2535,487,318,
4,205000,Powiat jaworski,42493,7222,14572,4547,8554,11316,1224,5524,...,4500,7931,2830,4059,4521,499,3088,667,598,


In [None]:
# Reshape the wide P4315 table (one column per sex/education combination) into
# a long table with one row per powiat, sex and education level, then save it.
df = pop_nc_se.copy()

# Drop the trailing "Unnamed: N" column (blank in the preview above) by prefix
# rather than by exact name: N is the column position, so a hard-coded name
# raises KeyError as soon as the raw file gains or loses a column.
df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]

# Melt to long format: every measure column becomes one (measure, count) row
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="count")

# Split the measure header into its ";"-separated parts,
# e.g. "females;tertiary;2021;[person]" -> sex, education, year
# (the unit part is not kept)
parts = long["measure"].str.split(";", expand=True)
long["sex"] = parts[0].str.strip()
long["education"] = parts[1].str.strip()
long["year"] = pd.to_numeric(parts[2]).astype("Int64")

out = long.rename(
    columns={"Code": "code", "Name": "powiat"}
).loc[:, ["code", "powiat", "year", "sex", "education", "count"]]

# NOTE(review): the default index is written as an unnamed first column;
# pass index=False here if downstream readers do not rely on it.
out.to_csv(repo_root / "cleaned/03_01_outcome_data/pop_nc_sex_ed_p4315.csv")

**B - National Census 2021 (employment):**

Cleaning P4407 - Employed by age and sex

In [23]:
# Load the raw P4407 table (semicolon-separated) and preview the first rows.
emp_sa = pd.read_csv(
    repo_root / "raw/03_01_population_data/03_01_census_p4407_employed_sex_age.csv",
    sep=";",
)
emp_sa.head()

Unnamed: 0,Code,Name,total;total;2021;[person],total;15-24;2021;[person],total;25-34;2021;[person],total;35-44;2021;[person],total;45-54;2021;[person],total;55-64;2021;[person],total;65 and more;2021;[person],males;total;2021;[person],...,males;55-64;2021;[person],males;65 and more;2021;[person],females;total;2021;[person],females;15-24;2021;[person],females;25-34;2021;[person],females;35-44;2021;[person],females;45-54;2021;[person],females;55-64;2021;[person],females;65 and more;2021;[person],Unnamed: 23
0,201000,Powiat bolesławiecki,38792,3268,8658,10510,8577,6316,1463,20869,...,3634,863,17923,1431,4012,4946,4252,2682,600,
1,202000,Powiat dzierżoniowski,39657,3249,8023,11025,8860,6900,1600,21238,...,4037,968,18419,1483,3813,5240,4388,2863,632,
2,203000,Powiat głogowski,36209,2678,7711,11170,8054,5261,1335,19946,...,2969,804,16263,1187,3425,5022,3806,2292,531,
3,204000,Powiat górowski,13765,1385,2859,3567,3157,2337,460,7834,...,1416,309,5931,565,1206,1617,1471,921,151,
4,205000,Powiat jaworski,20715,1600,4232,5719,4596,3768,800,11289,...,2220,483,9426,713,1886,2711,2251,1548,317,


In [27]:
# Reshape the wide P4407 table (one column per sex/age-band combination) into
# a long table with one row per powiat, sex and age band, then save it as CSV.
df = emp_sa.copy()

# Drop the trailing "Unnamed: N" column (blank in the preview above) by prefix
# rather than by exact name: N is the column position, so a hard-coded name
# raises KeyError as soon as the raw file gains or loses a column.
df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]

# Melt to long format: every measure column becomes one (measure, count) row
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="count")

# Split the measure header into its ";"-separated parts,
# e.g. "females;15-24;2021;[person]" -> sex, age, year (the unit part is not kept)
parts = long["measure"].str.split(";", expand=True)
long["sex"] = parts[0].str.strip()
long["age"] = parts[1].str.strip()  # age band as text, e.g. "15-24", "65 and more"
long["year"] = pd.to_numeric(parts[2]).astype("Int64")

out = long.rename(
    columns={"Code": "code", "Name": "powiat"}
).loc[:, ["code", "powiat", "year", "sex", "age", "count"]]

# NOTE(review): the default index is written as an unnamed first column;
# pass index=False here if downstream readers do not rely on it.
out.to_csv(repo_root / "cleaned/03_01_outcome_data/emp_nc_sex_age_p4407.csv")

Cleaning P4303 - Employed by sex and education

In [29]:
# Load the raw P4303 table (semicolon-separated) and preview the first rows.
emp_se = pd.read_csv(
    repo_root / "raw/03_01_population_data/03_01_census_p4303_employed_sex_ed.csv",
    sep=";",
)
emp_se.head()

Unnamed: 0,Code,Name,total;total;2021;[person],total;tertiary;2021;[person],total;secondary and post-secondary - total;2021;[person],total;basic vocational/sectoral;2021;[person],"total;lower secondary, primary, primary not completed and without school education;2021;[person]",males;total;2021;[person],males;tertiary;2021;[person],males;secondary and post-secondary - total;2021;[person],males;basic vocational/sectoral;2021;[person],"males;lower secondary, primary, primary not completed and without school education;2021;[person]",females;total;2021;[person],females;tertiary;2021;[person],females;secondary and post-secondary - total;2021;[person],females;basic vocational/sectoral;2021;[person],"females;lower secondary, primary, primary not completed and without school education;2021;[person]",Unnamed: 17
0,201000,Powiat bolesławiecki,38792,9881,15604,9966,2712,20869,3796,8414,6613,1645,17923,6085,7190,3353,1067,
1,202000,Powiat dzierżoniowski,39657,10204,16804,8663,3253,21238,3837,8992,5807,2137,18419,6367,7812,2856,1116,
2,203000,Powiat głogowski,36209,11926,15055,7129,1688,19946,4777,8697,5093,1086,16263,7149,6358,2036,602,
3,204000,Powiat górowski,13765,2875,5454,4083,1222,7834,1037,3008,2879,826,5931,1838,2446,1204,396,
4,205000,Powiat jaworski,20715,5344,8030,5648,1421,11289,2051,4278,3862,919,9426,3293,3752,1786,502,


In [33]:
# Reshape the wide P4303 table (one column per sex/education combination) into
# a long table with one row per powiat, sex and education level, then save it.
df = emp_se.copy()

# Drop the trailing "Unnamed: N" column (blank in the preview above) by prefix
# rather than by exact name: N is the column position, so a hard-coded name
# raises KeyError as soon as the raw file gains or loses a column.
df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]

# Melt to long format: every measure column becomes one (measure, count) row
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="count")

# Split the measure header into its ";"-separated parts,
# e.g. "males;tertiary;2021;[person]" -> sex, education, year
# (the unit part is not kept). Some education labels contain commas, which is
# harmless here because only ";" is used as the separator.
parts = long["measure"].str.split(";", expand=True)
long["sex"] = parts[0].str.strip()
long["education"] = parts[1].str.strip()
long["year"] = pd.to_numeric(parts[2]).astype("Int64")

out = long.rename(
    columns={"Code": "code", "Name": "powiat"}
).loc[:, ["code", "powiat", "year", "sex", "education", "count"]]

# NOTE(review): the default index is written as an unnamed first column;
# pass index=False here if downstream readers do not rely on it.
out.to_csv(repo_root / "cleaned/03_01_outcome_data/emp_nc_sex_ed_p4303.csv")

**C - Population by sex-age by powiat - P2137**

In [34]:
# Load the raw P2137 table (semicolon-separated) and preview the first rows.
pop_sa_yr = pd.read_csv(
    repo_root / "raw/03_01_population_data/03_01_pop_p2137_sex_agegr.csv",
    sep=";",
)
pop_sa_yr.head()

Unnamed: 0,Code,Name,total;total;1995;[person],total;total;1996;[person],total;total;1997;[person],total;total;1998;[person],total;total;1999;[person],total;total;2000;[person],total;total;2001;[person],total;total;2002;[person],...,0-14;females;2016;[person],0-14;females;2017;[person],0-14;females;2018;[person],0-14;females;2019;[person],0-14;females;2020;[person],0-14;females;2021;[person],0-14;females;2022;[person],0-14;females;2023;[person],0-14;females;2024;[person],Unnamed: 1892
0,201000,Powiat bolesławiecki,89407.0,89411.0,89596.0,89590.0,87740.0,88005.0,88121.0,88132.0,...,6543.0,6562.0,6575.0,6586.0,6598.0,6567.0,6427.0,6242.0,5988.0,
1,202000,Powiat dzierżoniowski,113810.0,113323.0,112857.0,112466.0,107810.0,107424.0,107112.0,106479.0,...,6508.0,6502.0,6477.0,6474.0,6448.0,6336.0,6174.0,5947.0,5663.0,
2,203000,Powiat głogowski,91373.0,91590.0,91656.0,91636.0,87962.0,88092.0,88181.0,87868.0,...,6835.0,6791.0,6786.0,6713.0,6661.0,6532.0,6300.0,6002.0,5720.0,
3,204000,Powiat górowski,37826.0,37781.0,37976.0,38017.0,36821.0,36924.0,36789.0,36817.0,...,2660.0,2622.0,2576.0,2533.0,2445.0,2412.0,2353.0,2278.0,2194.0,
4,205000,Powiat jaworski,54914.0,54829.0,54921.0,54818.0,53105.0,53116.0,52874.0,52634.0,...,3437.0,3486.0,3474.0,3392.0,3432.0,3361.0,3267.0,3127.0,2968.0,


In [38]:
# Reshape the wide P2137 table (one column per age-group/sex/year combination)
# into a long table with one row per powiat, year, sex and age group, then save.
df = pop_sa_yr.copy()

# Drop the trailing "Unnamed: N" column (blank in the preview above) by prefix
# rather than by exact name: N is the column position, so a hard-coded name
# raises KeyError as soon as the raw file gains or loses a column
# (for example when another year is added to this series).
df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]

# Melt to long format: every measure column becomes one (measure, count) row
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="count")

# Split the measure header into its ";"-separated parts. In this table the age
# group comes BEFORE the sex, e.g. "0-14;females;2016;[person]"
# (the unit part is not kept).
parts = long["measure"].str.split(";", expand=True)
long["age_group"] = parts[0].str.strip()
long["sex"] = parts[1].str.strip()
long["year"] = pd.to_numeric(parts[2]).astype("Int64")

out = long.rename(
    columns={"Code": "code", "Name": "powiat"}
).loc[:, ["code", "powiat", "year", "sex", "age_group", "count"]]

# NOTE(review): the default index is written as an unnamed first column;
# pass index=False here if downstream readers do not rely on it.
out.to_csv(repo_root / "cleaned/03_01_outcome_data/pop_yr_sex_agegr_p2137.csv")

**D - National Census Activity Table**

Cleaning P4292 - Economic activity of the population aged 15 years and more by sex and place of residence

In [3]:
# Load the raw P4292 table (semicolon-separated) and preview the first rows.
ec_ac_table = pd.read_csv(
    repo_root / "raw/03_01_population_data/03_01_census_p4292_activity_table.csv",
    sep=";",
)
ec_ac_table.head()

Unnamed: 0,Code,Name,total;total;total;2021;[person],total;total;economically active population;2021;[person],total;total;employed;2021;[person],total;total;unemployed;2021;[person],total;total;economically inactive persons;2021;[person],total;total;unidentified status on the labour market;2021;[person],total;total;activity rate;2021;[%],total;total;employment rate;2021;[%],...,total;females;total;2021;[person],total;females;economically active population;2021;[person],total;females;employed;2021;[person],total;females;unemployed;2021;[person],total;females;economically inactive persons;2021;[person],total;females;unidentified status on the labour market;2021;[person],total;females;activity rate;2021;[%],total;females;employment rate;2021;[%],total;females;unemployment rate;2021;[%],Unnamed: 29
0,201000,Powiat bolesławiecki,74863,40280,38792,1488,28735,5848,58.4,56.2,...,38908,18598,17923,675,17553,2757,51.4,49.6,3.6,
1,202000,Powiat dzierżoniowski,84726,41553,39657,1896,34095,9078,54.9,52.4,...,44920,19229,18419,810,21278,4413,47.5,45.5,4.2,
2,203000,Powiat głogowski,73213,37825,36209,1616,30649,4739,55.2,52.9,...,37995,17168,16263,905,18545,2282,48.1,45.5,5.3,
3,204000,Powiat górowski,28309,14860,13765,1095,11769,1680,55.8,51.7,...,14336,6452,5931,521,7089,795,47.6,43.8,8.1,
4,205000,Powiat jaworski,41503,21874,20715,1159,16512,3117,57.0,54.0,...,21331,9899,9426,473,9948,1484,49.9,47.5,4.8,


In [18]:
# Reshape the wide P4292 activity table into one row per powiat, sex and year,
# with one column per activity variable, then save it as CSV.
df = ec_ac_table.copy()

# Drop the trailing "Unnamed: N" column (blank in the preview above) by prefix
# rather than by exact name: N is the column position, so a hard-coded name
# raises KeyError as soon as the raw file gains or loses a column.
df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]

# Melt to long format. "count" holds both person counts and percentage rates,
# because the headers end in either "[person]" or "[%]".
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="count")

# Split the measure header into its ";"-separated parts,
# e.g. "total;females;employed;2021;[person]".
parts = long["measure"].str.split(";", expand=True)
# parts[0] is deliberately not kept: it reads "total" in every header visible
# in the preview. NOTE(review): if it ever takes more than one value, the pivot
# below will raise ValueError on duplicate index/column pairs - confirm against
# the full header list.
long["sex"] = parts[1].str.strip()
long["variable"] = parts[2].str.strip()
long["year"] = pd.to_numeric(parts[3]).astype("Int64")

# Re-pivot so that each activity variable becomes its own column
long = long.pivot(
    index=["Code", "Name", "sex", "year"],
    columns="variable",
    values="count",
).reset_index()

out = long.rename(columns={"Code": "code", "Name": "powiat"})

# NOTE(review): the default index is written as an unnamed first column;
# pass index=False here if downstream readers do not rely on it.
out.to_csv(repo_root / "cleaned/03_01_outcome_data/nc_activity_table.csv")

**E - Voivodeship Rates**

Cleaning P4108 - Activity rate, and P4113 - Employment rate

In [19]:
# Load the raw P4108 activity-rate table (semicolon-separated) and preview it.
p4108 = pd.read_csv(
    repo_root / "raw/03_01_population_data/03_01_lfs_vo_activity_rate_p4108.csv",
    sep=";",
)
p4108.head()

Unnamed: 0,Code,Name,total;total;numerical data;2010;[%],total;total;numerical data;2011;[%],total;total;numerical data;2012;[%],total;total;numerical data;2013;[%],total;total;numerical data;2014;[%],total;total;numerical data;2015;[%],total;total;numerical data;2016;[%],total;total;numerical data;2017;[%],...,50-89;total;numerical data;2016;[%],50-89;total;numerical data;2017;[%],50-89;total;numerical data;2018;[%],50-89;total;numerical data;2019;[%],50-89;total;numerical data;2020;[%],50-89;total;numerical data;2021;[%],50-89;total;numerical data;2022;[%],50-89;total;numerical data;2023;[%],50-89;total;numerical data;2024;[%],Unnamed: 122
0,200000,DOLNOŚLĄSKIE,53.9,53.3,53.8,54.0,55.2,55.5,56.8,56.4,...,34.3,33.9,33.6,33.6,33.7,35.2,35.7,36.3,35.1,
1,400000,KUJAWSKO-POMORSKIE,53.8,53.7,55.7,55.3,54.9,55.1,55.4,54.9,...,33.4,33.2,33.5,32.8,32.7,36.1,36.9,37.3,37.8,
2,600000,LUBELSKIE,52.9,53.8,54.1,53.9,54.5,54.7,53.8,53.9,...,32.8,33.1,33.0,32.6,34.0,35.7,36.7,35.2,35.9,
3,800000,LUBUSKIE,55.2,54.5,53.3,54.0,53.7,54.5,55.7,55.7,...,35.2,34.4,34.0,33.6,33.9,33.8,35.9,37.3,36.4,
4,1000000,ŁÓDZKIE,54.6,55.4,55.2,56.2,57.0,56.2,56.1,57.2,...,33.5,34.7,34.5,32.3,33.4,35.5,37.1,35.9,36.7,


In [30]:
# Reshape the wide P4108 activity-rate table into a long table with one row per
# voivodeship, year and age group, then save it as CSV.
df = p4108.copy()

# Drop the trailing "Unnamed: N" column (blank in the preview above) by prefix
# rather than by exact name: N is the column position, so a hard-coded name
# raises KeyError as soon as the raw file gains or loses a column
# (for example when another year is added to this series).
df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]

# Melt to long format: every measure column becomes one (measure, rate) row
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="rate")

# Split the measure header into its ";"-separated parts,
# e.g. "50-89;total;numerical data;2016;[%]".
parts = long["measure"].str.split(";", expand=True)
long["age"] = parts[0].str.strip()
# parts[1] and parts[2] are not kept; in the previewed headers they read
# "total" and "numerical data". NOTE(review): if either takes other values in
# the full file, rows for the same (code, year, age) become indistinguishable
# in the output - confirm against the full header list.
long["year"] = pd.to_numeric(parts[3]).astype("Int64")

out = long.rename(
    columns={"Code": "code", "Name": "voivodeship"}
).loc[:, ["code", "voivodeship", "year", "age", "rate"]]

# NOTE(review): the default index is written as an unnamed first column;
# pass index=False here if downstream readers do not rely on it.
out.to_csv(repo_root / "cleaned/03_01_outcome_data/lfs_vo_activity_rate_p4108.csv")

out

Unnamed: 0,code,voivodeship,year,age,rate
0,200000,DOLNOŚLĄSKIE,2010,total,53.9
1,400000,KUJAWSKO-POMORSKIE,2010,total,53.8
2,600000,LUBELSKIE,2010,total,52.9
3,800000,LUBUSKIE,2010,total,55.2
4,1000000,ŁÓDZKIE,2010,total,54.6
...,...,...,...,...,...
1915,2400000,ŚLĄSKIE,2024,50-89,33.1
1916,2600000,ŚWIĘTOKRZYSKIE,2024,50-89,35.7
1917,2800000,WARMIŃSKO-MAZURSKIE,2024,50-89,36.0
1918,3000000,WIELKOPOLSKIE,2024,50-89,37.5


In [31]:
# Load the raw P4113 employment-rate table (semicolon-separated) and preview it.
p4113 = pd.read_csv(
    repo_root / "raw/03_01_population_data/03_01_lfs_vo_employ_rate_p4113.csv",
    sep=";",
)
p4113.head()

Unnamed: 0,Code,Name,total;total;numerical data;2010;[%],total;total;numerical data;2011;[%],total;total;numerical data;2012;[%],total;total;numerical data;2013;[%],total;total;numerical data;2014;[%],total;total;numerical data;2015;[%],total;total;numerical data;2016;[%],total;total;numerical data;2017;[%],...,60-89;total;numerical data;2016;[%],60-89;total;numerical data;2017;[%],60-89;total;numerical data;2018;[%],60-89;total;numerical data;2019;[%],60-89;total;numerical data;2020;[%],60-89;total;numerical data;2021;[%],60-89;total;numerical data;2022;[%],60-89;total;numerical data;2023;[%],60-89;total;numerical data;2024;[%],Unnamed: 197
0,200000,DOLNOŚLĄSKIE,,,,,,,,,...,,,,15.5,16.4,17.5,17.5,17.2,14.5,
1,400000,KUJAWSKO-POMORSKIE,,,,,,,,,...,,,,13.1,13.4,15.0,14.7,14.0,14.2,
2,600000,LUBELSKIE,,,,,,,,,...,,,,11.9,12.4,15.3,14.9,14.6,14.5,
3,800000,LUBUSKIE,,,,,,,,,...,,,,15.0,15.3,15.3,16.4,17.0,16.3,
4,1000000,ŁÓDZKIE,,,,,,,,,...,,,,12.5,13.5,14.9,17.1,14.8,15.8,


In [37]:
# Reshape the wide P4113 employment-rate table into a long table with one row
# per voivodeship, year and age group, then save it as CSV.
df = p4113.copy()

# Drop the trailing "Unnamed: N" column (blank in the preview above) by prefix
# rather than by exact name: N is the column position, so a hard-coded name
# raises KeyError as soon as the raw file gains or loses a column
# (for example when another year is added to this series).
df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]

# Melt to long format: every measure column becomes one (measure, rate) row.
# Missing rates (blank in the preview for the early years) stay as NaN.
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="rate")

# Split the measure header into its ";"-separated parts,
# e.g. "60-89;total;numerical data;2019;[%]".
parts = long["measure"].str.split(";", expand=True)
long["age"] = parts[0].str.strip()
# parts[1] and parts[2] are not kept; in the previewed headers they read
# "total" and "numerical data". NOTE(review): if either takes other values in
# the full file, rows for the same (code, year, age) become indistinguishable
# in the output - confirm against the full header list.
long["year"] = pd.to_numeric(parts[3]).astype("Int64")

out = long.rename(
    columns={"Code": "code", "Name": "voivodeship"}
).loc[:, ["code", "voivodeship", "year", "age", "rate"]]

# NOTE(review): the default index is written as an unnamed first column;
# pass index=False here if downstream readers do not rely on it.
out.to_csv(repo_root / "cleaned/03_01_outcome_data/lfs_vo_employ_rate_p4113.csv")

out

Unnamed: 0,code,voivodeship,year,age,rate
0,200000,DOLNOŚLĄSKIE,2010,total,
1,400000,KUJAWSKO-POMORSKIE,2010,total,
2,600000,LUBELSKIE,2010,total,
3,800000,LUBUSKIE,2010,total,
4,1000000,ŁÓDZKIE,2010,total,
...,...,...,...,...,...
3115,2400000,ŚLĄSKIE,2024,60-89,12.4
3116,2600000,ŚWIĘTOKRZYSKIE,2024,60-89,12.6
3117,2800000,WARMIŃSKO-MAZURSKIE,2024,60-89,14.8
3118,3000000,WIELKOPOLSKIE,2024,60-89,14.5
