In [1]:
import sys
from pathlib import Path

# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upward, that contains a ".git" entry.
p = Path.cwd().resolve()
repo_root = None
for candidate in (p, *p.parents):
    if (candidate / ".git").exists():
        repo_root = candidate
        break

if repo_root is None:
    raise RuntimeError("Repo root not found. Open the repo folder in VS Code.")

# Put the repo root at the front of the import search path.
sys.path.insert(0, str(repo_root))
print("Repo root:", repo_root)

Repo root: C:\Users\harri\OneDrive - Imperial College London\Year 3 Group Project\Group_Project_Y3


In [2]:
import numpy as np
import pandas as pd

**A - National Census 2021 (population):**

Cleaning P4181 - Population by age and sex

In [5]:
# Load the raw P4181 export (semicolon-separated) and preview the first rows.
# NOTE(review): "Code" is shown without zero padding in the preview; if the raw
# file stores zero-padded codes they are lost on numeric parsing - confirm.
pop_nc_as_path = repo_root / "raw/03_01_population_data/03_01_census_p4181_pop_sex_age.csv"
pop_nc_as = pd.read_csv(pop_nc_as_path, sep=";")
pop_nc_as.head()

Unnamed: 0,Code,Name,total;total;2021;[person],total;0;2021;[person],total;1;2021;[person],total;2;2021;[person],total;3;2021;[person],total;4;2021;[person],total;5;2021;[person],total;6;2021;[person],...,females;82;2021;[person],females;83;2021;[person],females;84;2021;[person],females;85;2021;[person],females;86;2021;[person],females;87;2021;[person],females;88;2021;[person],females;89;2021;[person],females;90 and more;2021;[person],Unnamed: 278
0,201000,Powiat bolesławiecki,88435,718,799,891,910,949,886,899,...,244,273,215,222,222,173,135,127,419,
1,202000,Powiat dzierżoniowski,97721,658,747,791,886,877,829,873,...,368,391,347,317,258,246,228,205,768,
2,203000,Powiat głogowski,86668,705,764,769,832,848,848,885,...,177,198,193,171,141,117,89,88,304,
3,204000,Powiat górowski,33317,275,325,297,356,307,300,320,...,73,92,87,89,79,63,69,53,175,
4,205000,Powiat jaworski,48503,397,425,413,447,457,460,458,...,126,128,118,126,106,117,82,87,285,


In [12]:
# Reshape the wide P4181 table (one column per sex/age combination) into a long
# table with one row per (Code, Name, measure column), then save it as CSV.

# Drop the empty "Unnamed: N" column(s) by prefix rather than by the
# position-dependent name "Unnamed: 278", so the cell keeps working if the raw
# file gains or loses columns. drop() returns a new frame; pop_nc_as is untouched.
unnamed_cols = pop_nc_as.columns[pop_nc_as.columns.str.startswith("Unnamed:")]
df = pop_nc_as.drop(columns=unnamed_cols)

# Melt to long format: every non-id column becomes a (measure, count) row.
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="count")

# Each measure header has the form "<sex>;<age>;<year>;[person]".
# The "total" aggregates are not filtered out; they stay as ordinary rows.
parts = long["measure"].str.split(";", expand=True)
long = long.assign(
    sex=parts[0].str.strip(),
    age=parts[1].str.strip(),
    year=pd.to_numeric(parts[2]).astype("Int64"),  # nullable integer dtype
)

# Rename the id columns and keep only the tidy columns, in a fixed order.
out = long.rename(columns={"Code": "code", "Name": "powiat"})[
    ["code", "powiat", "year", "sex", "age", "count"]
]

# NOTE(review): raw data is read from "03_01_population_data" but written to
# "03_01_outcome_data" - confirm the target folder is intended.
out_path = repo_root / "cleaned/03_01_outcome_data/pop_nc_sex_age_p4181.csv"
# Create the target folder if needed; to_csv fails on a non-existent directory.
out_path.parent.mkdir(parents=True, exist_ok=True)
# The row index is still written as the first (unnamed) CSV column, as before.
out.to_csv(out_path)

Cleaning P4315 - Population aged 13 years and more by educational level and sex

In [15]:
# Load the raw P4315 export (semicolon-separated) and preview the first rows.
# NOTE(review): "Code" is shown without zero padding in the preview; if the raw
# file stores zero-padded codes they are lost on numeric parsing - confirm.
pop_nc_se_path = repo_root / "raw/03_01_population_data/03_01_census_p4315_pop_sex_ed.csv"
pop_nc_se = pd.read_csv(pop_nc_se_path, sep=";")
pop_nc_se.head()

Unnamed: 0,Code,Name,total;total;2021;[person],total;tertiary;2021;[person],total;secondary and post-secondary - total;2021;[person],total;secondary and post-secondary - general secondary;2021;[person],total;secondary and post-secondary - vocational secondary;2021;[person],total;basic vocational/sectoral;2021;[person],total;lower secondary;2021;[person],total;primary completed;2021;[person],...,females;tertiary;2021;[person],females;secondary and post-secondary - total;2021;[person],females;secondary and post-secondary - general secondary;2021;[person],females;secondary and post-secondary - vocational secondary;2021;[person],females;basic vocational/sectoral;2021;[person],females;lower secondary;2021;[person],females;primary completed;2021;[person],females;primary not completed and without school education;2021;[person],females;unknown;2021;[person],Unnamed: 32
0,201000,Powiat bolesławiecki,76739,13420,26886,7994,16114,19575,2207,9772,...,8351,14231,4880,7365,8243,985,5646,1216,1154,
1,202000,Powiat dzierżoniowski,86543,14680,31483,9645,18302,19891,2415,11947,...,9126,17048,5911,8710,8811,1035,6689,1350,1765,
2,203000,Powiat głogowski,75191,15725,27891,8358,16723,17123,2175,7877,...,9655,14637,5201,7429,7037,912,4626,1247,878,
3,204000,Powiat górowski,29042,3962,9693,2563,6148,8087,1091,4556,...,2554,5162,1591,2915,3185,434,2535,487,318,
4,205000,Powiat jaworski,42493,7222,14572,4547,8554,11316,1224,5524,...,4500,7931,2830,4059,4521,499,3088,667,598,


In [None]:
# Reshape the wide P4315 table (one column per sex/education combination) into a
# long table with one row per (Code, Name, measure column), then save it as CSV.

# Drop the empty "Unnamed: N" column(s) by prefix rather than by the
# position-dependent name "Unnamed: 32", so the cell keeps working if the raw
# file gains or loses columns. drop() returns a new frame; pop_nc_se is untouched.
unnamed_cols = pop_nc_se.columns[pop_nc_se.columns.str.startswith("Unnamed:")]
df = pop_nc_se.drop(columns=unnamed_cols)

# Melt to long format: every non-id column becomes a (measure, count) row.
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="count")

# Each measure header has the form "<sex>;<education>;<year>;[person]".
# The "total" aggregates are not filtered out; they stay as ordinary rows.
parts = long["measure"].str.split(";", expand=True)
long = long.assign(
    sex=parts[0].str.strip(),
    education=parts[1].str.strip(),
    year=pd.to_numeric(parts[2]).astype("Int64"),  # nullable integer dtype
)

# Rename the id columns and keep only the tidy columns, in a fixed order.
out = long.rename(columns={"Code": "code", "Name": "powiat"})[
    ["code", "powiat", "year", "sex", "education", "count"]
]

# NOTE(review): raw data is read from "03_01_population_data" but written to
# "03_01_outcome_data" - confirm the target folder is intended.
out_path = repo_root / "cleaned/03_01_outcome_data/pop_nc_sex_ed_p4315.csv"
# Create the target folder if needed; to_csv fails on a non-existent directory.
out_path.parent.mkdir(parents=True, exist_ok=True)
# The row index is still written as the first (unnamed) CSV column, as before.
out.to_csv(out_path)

**B - National Census 2021 (employment):**

Cleaning P4407 - Employed by age and sex

In [23]:
# Load the raw P4407 export (semicolon-separated) and preview the first rows.
# NOTE(review): "Code" is shown without zero padding in the preview; if the raw
# file stores zero-padded codes they are lost on numeric parsing - confirm.
emp_sa_path = repo_root / "raw/03_01_population_data/03_01_census_p4407_employed_sex_age.csv"
emp_sa = pd.read_csv(emp_sa_path, sep=";")
emp_sa.head()

Unnamed: 0,Code,Name,total;total;2021;[person],total;15-24;2021;[person],total;25-34;2021;[person],total;35-44;2021;[person],total;45-54;2021;[person],total;55-64;2021;[person],total;65 and more;2021;[person],males;total;2021;[person],...,males;55-64;2021;[person],males;65 and more;2021;[person],females;total;2021;[person],females;15-24;2021;[person],females;25-34;2021;[person],females;35-44;2021;[person],females;45-54;2021;[person],females;55-64;2021;[person],females;65 and more;2021;[person],Unnamed: 23
0,201000,Powiat bolesławiecki,38792,3268,8658,10510,8577,6316,1463,20869,...,3634,863,17923,1431,4012,4946,4252,2682,600,
1,202000,Powiat dzierżoniowski,39657,3249,8023,11025,8860,6900,1600,21238,...,4037,968,18419,1483,3813,5240,4388,2863,632,
2,203000,Powiat głogowski,36209,2678,7711,11170,8054,5261,1335,19946,...,2969,804,16263,1187,3425,5022,3806,2292,531,
3,204000,Powiat górowski,13765,1385,2859,3567,3157,2337,460,7834,...,1416,309,5931,565,1206,1617,1471,921,151,
4,205000,Powiat jaworski,20715,1600,4232,5719,4596,3768,800,11289,...,2220,483,9426,713,1886,2711,2251,1548,317,


In [27]:
# Reshape the wide P4407 table (one column per sex/age-band combination) into a
# long table with one row per (Code, Name, measure column), then save it as CSV.

# Drop the empty "Unnamed: N" column(s) by prefix rather than by the
# position-dependent name "Unnamed: 23", so the cell keeps working if the raw
# file gains or loses columns. drop() returns a new frame; emp_sa is untouched.
unnamed_cols = emp_sa.columns[emp_sa.columns.str.startswith("Unnamed:")]
df = emp_sa.drop(columns=unnamed_cols)

# Melt to long format: every non-id column becomes a (measure, count) row.
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="count")

# Each measure header has the form "<sex>;<age band>;<year>;[person]".
# The "total" aggregates are not filtered out; they stay as ordinary rows.
parts = long["measure"].str.split(";", expand=True)
long = long.assign(
    sex=parts[0].str.strip(),
    age=parts[1].str.strip(),
    year=pd.to_numeric(parts[2]).astype("Int64"),  # nullable integer dtype
)

# Rename the id columns and keep only the tidy columns, in a fixed order.
out = long.rename(columns={"Code": "code", "Name": "powiat"})[
    ["code", "powiat", "year", "sex", "age", "count"]
]

# NOTE(review): raw data is read from "03_01_population_data" but written to
# "03_01_outcome_data" - confirm the target folder is intended.
out_path = repo_root / "cleaned/03_01_outcome_data/emp_nc_sex_age_p4407.csv"
# Create the target folder if needed; to_csv fails on a non-existent directory.
out_path.parent.mkdir(parents=True, exist_ok=True)
# The row index is still written as the first (unnamed) CSV column, as before.
out.to_csv(out_path)

Cleaning P4303 - Employed by sex and education

In [29]:
# Load the raw P4303 export (semicolon-separated) and preview the first rows.
# NOTE(review): "Code" is shown without zero padding in the preview; if the raw
# file stores zero-padded codes they are lost on numeric parsing - confirm.
emp_se_path = repo_root / "raw/03_01_population_data/03_01_census_p4303_employed_sex_ed.csv"
emp_se = pd.read_csv(emp_se_path, sep=";")
emp_se.head()

Unnamed: 0,Code,Name,total;total;2021;[person],total;tertiary;2021;[person],total;secondary and post-secondary - total;2021;[person],total;basic vocational/sectoral;2021;[person],"total;lower secondary, primary, primary not completed and without school education;2021;[person]",males;total;2021;[person],males;tertiary;2021;[person],males;secondary and post-secondary - total;2021;[person],males;basic vocational/sectoral;2021;[person],"males;lower secondary, primary, primary not completed and without school education;2021;[person]",females;total;2021;[person],females;tertiary;2021;[person],females;secondary and post-secondary - total;2021;[person],females;basic vocational/sectoral;2021;[person],"females;lower secondary, primary, primary not completed and without school education;2021;[person]",Unnamed: 17
0,201000,Powiat bolesławiecki,38792,9881,15604,9966,2712,20869,3796,8414,6613,1645,17923,6085,7190,3353,1067,
1,202000,Powiat dzierżoniowski,39657,10204,16804,8663,3253,21238,3837,8992,5807,2137,18419,6367,7812,2856,1116,
2,203000,Powiat głogowski,36209,11926,15055,7129,1688,19946,4777,8697,5093,1086,16263,7149,6358,2036,602,
3,204000,Powiat górowski,13765,2875,5454,4083,1222,7834,1037,3008,2879,826,5931,1838,2446,1204,396,
4,205000,Powiat jaworski,20715,5344,8030,5648,1421,11289,2051,4278,3862,919,9426,3293,3752,1786,502,


In [33]:
# Reshape the wide P4303 table (one column per sex/education combination) into a
# long table with one row per (Code, Name, measure column), then save it as CSV.

# Drop the empty "Unnamed: N" column(s) by prefix rather than by the
# position-dependent name "Unnamed: 17", so the cell keeps working if the raw
# file gains or loses columns. drop() returns a new frame; emp_se is untouched.
unnamed_cols = emp_se.columns[emp_se.columns.str.startswith("Unnamed:")]
df = emp_se.drop(columns=unnamed_cols)

# Melt to long format: every non-id column becomes a (measure, count) row.
id_cols = ["Code", "Name"]
long = df.melt(id_vars=id_cols, var_name="measure", value_name="count")

# Each measure header has the form "<sex>;<education>;<year>;[person]".
# Some education labels contain commas, but the split is on ";" only.
# The "total" aggregates are not filtered out; they stay as ordinary rows.
parts = long["measure"].str.split(";", expand=True)
long = long.assign(
    sex=parts[0].str.strip(),
    education=parts[1].str.strip(),
    year=pd.to_numeric(parts[2]).astype("Int64"),  # nullable integer dtype
)

# Rename the id columns and keep only the tidy columns, in a fixed order.
out = long.rename(columns={"Code": "code", "Name": "powiat"})[
    ["code", "powiat", "year", "sex", "education", "count"]
]

# NOTE(review): raw data is read from "03_01_population_data" but written to
# "03_01_outcome_data" - confirm the target folder is intended.
out_path = repo_root / "cleaned/03_01_outcome_data/emp_nc_sex_ed_p4303.csv"
# Create the target folder if needed; to_csv fails on a non-existent directory.
out_path.parent.mkdir(parents=True, exist_ok=True)
# The row index is still written as the first (unnamed) CSV column, as before.
out.to_csv(out_path)