In [1]:
import sys
from pathlib import Path

p = Path.cwd().resolve()
repo_root = next((parent for parent in [p] + list(p.parents) if (parent / ".git").exists()), None)
if repo_root is None:
    raise RuntimeError("Repo root not found. Open the repo folder in VS Code.")

sys.path.insert(0, str(repo_root))
print("Repo root:", repo_root)

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

Repo root: C:\Users\harri\OneDrive - Imperial College London\Year 3 Group Project\Group_Project_Y3


In [2]:
county_codes = pd.read_csv(repo_root / "cleaned/00_codes/county_codes.csv")
county_codes

Unnamed: 0,county_code,county_kts,county_name
0,201,10030210101000,Powiat bolesławiecki
1,202,10030210302000,Powiat dzierżoniowski
2,203,10030210203000,Powiat głogowski
3,204,10030210204000,Powiat górowski
4,205,10030210105000,Powiat jaworski
...,...,...,...
375,3217,10023216417000,Powiat wałecki
376,3218,10023216418000,Powiat łobeski
377,3261,10023216361000,Powiat m. Koszalin
378,3262,10023216562000,Powiat m. Szczecin


**SECTION 1 - UNEMPLOYMENT BY POWIAT**

We will take both ru measures and sum by powait (to check sum consistent). then taking one we will merge with powiat measures of economically active -> unemployment rate by powiat.

In [3]:
p1946 = pd.read_csv(repo_root / "cleaned/03_01_outcome_data/ru_sex_age_p1946.csv", index_col=0)
p1947 = pd.read_csv(repo_root / "cleaned/03_01_outcome_data/ru_sex_ed_p1947.csv", index_col=0)

# Create county_code column for each
for df in [p1946, p1947]:
    df["county_code"] = df["code"].apply(lambda x: int(str(x)[:-3]))

# Now take only sex total and age/education total
p1946 = p1946[
    (p1946["age"]=="total") & (p1946["sex"]=="total")
]

p1947 = p1947[
    (p1947["education"]=="total") & (p1947["sex"]=="total")
]

ru_series_p1946 = p1946.groupby(["county_code", "year"])["count"].sum()
ru_series_p1947 = p1947.groupby(["county_code", "year"])["count"].sum()

(ru_series_p1946 == ru_series_p1947).sum()

np.int64(9880)

Given they are the same we will use the first

In [4]:
ru_series_p1946 = ru_series_p1946.reset_index()
ru_series_p1946 = ru_series_p1946.rename(columns={"count": "ru"})
ru_series_p1946

Unnamed: 0,county_code,year,ru
0,201,2000,8886.0
1,201,2001,9218.0
2,201,2002,8574.0
3,201,2003,8803.0
4,201,2004,7966.0
...,...,...,...
9875,3263,2021,559.0
9876,3263,2022,521.0
9877,3263,2023,545.0
9878,3263,2024,585.0


Gather required columns from poplation powiat table

In [5]:
ptot = pd.read_csv(repo_root / "cleaned/03_01_outcome_tables/population_powiat.csv", index_col=0)

ptot_cols = [
    "county_code", "year",
    "county_kts", "county_name",
    "tp1_nc_pop", "tp2_yr_pop", "tp3b_nc_active", "tp4b_active", "tp5b_active", "tp6b_active"
]

ptot = ptot[ptot_cols]

Now merge the tables:

In [6]:
ru_series_p1946.set_index(["county_code", "year"], inplace=True)
ptot.set_index(["county_code", "year"], inplace=True)

ru_powiat = ru_series_p1946.join(ptot, how="left").reset_index()

# Save
ru_powiat.to_csv(repo_root / "cleaned/03_01_outcome_tables/unemploy_table.csv")

ru_powiat

Unnamed: 0,county_code,year,ru,county_kts,county_name,tp1_nc_pop,tp2_yr_pop,tp3b_nc_active,tp4b_active,tp5b_active,tp6b_active
0,201,2000,8886.0,1.003021e+13,Powiat bolesławiecki,76739.0,64323.0,40280.0,37565.0,,
1,201,2001,9218.0,1.003021e+13,Powiat bolesławiecki,76739.0,64858.0,40280.0,37877.0,,
2,201,2002,8574.0,1.003021e+13,Powiat bolesławiecki,76739.0,71744.0,40280.0,41898.0,,
3,201,2003,8803.0,1.003021e+13,Powiat bolesławiecki,76739.0,72499.0,40280.0,42339.0,,
4,201,2004,7966.0,1.003021e+13,Powiat bolesławiecki,76739.0,73078.0,40280.0,42678.0,,
...,...,...,...,...,...,...,...,...,...,...,...
9875,3263,2021,559.0,1.002322e+13,Powiat m. Świnoujście,35988.0,34960.0,17737.0,19752.0,19578.0,19752.0
9876,3263,2022,521.0,1.002322e+13,Powiat m. Świnoujście,35988.0,34619.0,17737.0,19560.0,19664.0,19839.0
9877,3263,2023,545.0,1.002322e+13,Powiat m. Świnoujście,35988.0,34317.0,17737.0,19389.0,19664.0,19839.0
9878,3263,2024,585.0,1.002322e+13,Powiat m. Świnoujście,35988.0,34148.0,17737.0,19294.0,19499.0,19673.0


In [7]:
ru_series_p1946

Unnamed: 0_level_0,Unnamed: 1_level_0,ru
county_code,year,Unnamed: 2_level_1
201,2000,8886.0
201,2001,9218.0
201,2002,8574.0
201,2003,8803.0
201,2004,7966.0
...,...,...
3263,2021,559.0
3263,2022,521.0
3263,2023,545.0
3263,2024,585.0


**SECTION 2 - UNEMPLOYMENT POWIAT SEX AGE:**

In [8]:
p1946 = pd.read_csv(repo_root / "cleaned/03_01_outcome_data/ru_sex_age_p1946.csv", index_col=0)

# create county code
p1946["county_code"] = p1946["code"].apply(lambda x: int(str(x)[:-3]))

# drop total sex and irrelevant ages
p1946 = p1946[~(p1946["sex"]=="total")].copy()
p1946 = p1946[~(p1946["age"].isin(["total", "25 and more"]))].copy()

# rename count column
p1946.rename(columns={"count": "ru", "age": "age_group"}, inplace=True)

# set up ready for join
ru_sa_table = p1946.set_index(["county_code", "year", "sex", "age_group"])[["ru"]]
ru_sa_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ru
county_code,year,sex,age_group,Unnamed: 4_level_1
201,2000,males,under 25 years,1154.0
202,2000,males,under 25 years,1265.0
203,2000,males,under 25 years,1131.0
204,2000,males,under 25 years,562.0
205,2000,males,under 25 years,708.0
...,...,...,...,...
3217,2025,females,55 and more,79.0
3218,2025,females,55 and more,75.0
3261,2025,females,55 and more,146.0
3262,2025,females,55 and more,343.0


In [9]:
psa = pd.read_csv(repo_root / "cleaned/03_01_outcome_tables/population_powiat_sex_age.csv", index_col=0)

psa

Unnamed: 0,county_code,year,sex,age_group,psa1_nc_pop,psa2_yr_pop,psa3a_employed,psa3b_active,psa4a_employed,psa4b_active
0,201,1995,females,under 25 years,4001,6879.0,2430.0,2522.0,,
1,201,1995,females,25-34,5751,5740.0,4129.0,4285.0,,
2,201,1995,females,35-44,7153,7825.0,5426.0,5630.0,,
3,201,1995,females,45-54,5805,5051.0,3599.0,3735.0,,
4,201,1995,females,55 and more,16198,6775.0,1491.0,1547.0,,
...,...,...,...,...,...,...,...,...,...,...
113335,3263,2024,males,under 25 years,1489,1556.0,483.0,511.0,456.0,478.0
113336,3263,2024,males,25-34,2464,1898.0,1395.0,1477.0,1733.0,1646.0
113337,3263,2024,males,35-44,3441,3070.0,2051.0,2171.0,2803.0,2662.0
113338,3263,2024,males,45-54,2795,3132.0,2017.0,2135.0,2860.0,2715.0


In [10]:
ru_sa_table = ru_sa_table.join(
    psa.set_index(["county_code", "year", "sex", "age_group"]), how="outer"
).reset_index()
ru_sa_table

Unnamed: 0,county_code,year,sex,age_group,ru,psa1_nc_pop,psa2_yr_pop,psa3a_employed,psa3b_active,psa4a_employed,psa4b_active
0,201,1995,females,25-34,,5751.0,5740.0,4129.0,4285.0,,
1,201,1995,females,35-44,,7153.0,7825.0,5426.0,5630.0,,
2,201,1995,females,45-54,,5805.0,5051.0,3599.0,3735.0,,
3,201,1995,females,55 and more,,16198.0,6775.0,1491.0,1547.0,,
4,201,1995,females,under 25 years,,4001.0,6879.0,2430.0,2522.0,,
...,...,...,...,...,...,...,...,...,...,...,...
117395,3263,2025,males,25-34,52.0,,,,,,
117396,3263,2025,males,35-44,92.0,,,,,,
117397,3263,2025,males,45-54,118.0,,,,,,
117398,3263,2025,males,55 and more,118.0,,,,,,


In [11]:
ru_sa_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117400 entries, 0 to 117399
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   county_code     117400 non-null  int64  
 1   year            117400 non-null  int64  
 2   sex             117400 non-null  object 
 3   age_group       117400 non-null  object 
 4   ru              98510 non-null   float64
 5   psa1_nc_pop     113340 non-null  float64
 6   psa2_yr_pop     113340 non-null  float64
 7   psa3a_employed  113340 non-null  float64
 8   psa3b_active    113340 non-null  float64
 9   psa4a_employed  56970 non-null   float64
 10  psa4b_active    22800 non-null   float64
dtypes: float64(7), int64(2), object(2)
memory usage: 9.9+ MB


Can extend psa1 - the nc 2021 - to 2025 ru measure

In [12]:
# mask = ru_sa_table["year"] == 2025

# ru_sa_table.loc[mask, "psa1_nc_pop"] = (
#     ru_sa_table.groupby(["county_code", "sex", "age_group"])["psa1_nc_pop"]
#       .transform("first")[mask]
# )

ru_sa_table["psa1_nc_pop"] = ru_sa_table["psa1_nc_pop"].fillna(
    ru_sa_table.groupby(["county_code", "sex", "age_group"])["psa1_nc_pop"].transform("first")
)

ru_sa_table

Unnamed: 0,county_code,year,sex,age_group,ru,psa1_nc_pop,psa2_yr_pop,psa3a_employed,psa3b_active,psa4a_employed,psa4b_active
0,201,1995,females,25-34,,5751.0,5740.0,4129.0,4285.0,,
1,201,1995,females,35-44,,7153.0,7825.0,5426.0,5630.0,,
2,201,1995,females,45-54,,5805.0,5051.0,3599.0,3735.0,,
3,201,1995,females,55 and more,,16198.0,6775.0,1491.0,1547.0,,
4,201,1995,females,under 25 years,,4001.0,6879.0,2430.0,2522.0,,
...,...,...,...,...,...,...,...,...,...,...,...
117395,3263,2025,males,25-34,52.0,2464.0,,,,,
117396,3263,2025,males,35-44,92.0,3441.0,,,,,
117397,3263,2025,males,45-54,118.0,2795.0,,,,,
117398,3263,2025,males,55 and more,118.0,6582.0,,,,,


In [13]:
check_table = ru_sa_table.groupby(["county_code", "sex", "age_group"])["psa1_nc_pop"].agg(["mean", "std"])
check_table[check_table["std"]>0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,std
county_code,sex,age_group,Unnamed: 3_level_1,Unnamed: 4_level_1


In [14]:
ru_sa_table[ru_sa_table["county_code"]==3263].head(10)
ru_sa_table[ru_sa_table["county_code"]==3263].tail(10)

Unnamed: 0,county_code,year,sex,age_group,ru,psa1_nc_pop,psa2_yr_pop,psa3a_employed,psa3b_active,psa4a_employed,psa4b_active
117390,3263,2025,females,25-34,59.0,2401.0,,,,,
117391,3263,2025,females,35-44,87.0,3285.0,,,,,
117392,3263,2025,females,45-54,79.0,2800.0,,,,,
117393,3263,2025,females,55 and more,44.0,8534.0,,,,,
117394,3263,2025,females,under 25 years,19.0,1484.0,,,,,
117395,3263,2025,males,25-34,52.0,2464.0,,,,,
117396,3263,2025,males,35-44,92.0,3441.0,,,,,
117397,3263,2025,males,45-54,118.0,2795.0,,,,,
117398,3263,2025,males,55 and more,118.0,6582.0,,,,,
117399,3263,2025,males,under 25 years,18.0,1489.0,,,,,


In [15]:
ru_sa_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117400 entries, 0 to 117399
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   county_code     117400 non-null  int64  
 1   year            117400 non-null  int64  
 2   sex             117400 non-null  object 
 3   age_group       117400 non-null  object 
 4   ru              98510 non-null   float64
 5   psa1_nc_pop     117400 non-null  float64
 6   psa2_yr_pop     113340 non-null  float64
 7   psa3a_employed  113340 non-null  float64
 8   psa3b_active    113340 non-null  float64
 9   psa4a_employed  56970 non-null   float64
 10  psa4b_active    22800 non-null   float64
dtypes: float64(7), int64(2), object(2)
memory usage: 9.9+ MB


In [16]:
ru_sa_table[ru_sa_table["psa1_nc_pop"].isna()]

Unnamed: 0,county_code,year,sex,age_group,ru,psa1_nc_pop,psa2_yr_pop,psa3a_employed,psa3b_active,psa4a_employed,psa4b_active


In [17]:
ru_sa_table[ru_sa_table["county_code"]==3218].head(25)

Unnamed: 0,county_code,year,sex,age_group,ru,psa1_nc_pop,psa2_yr_pop,psa3a_employed,psa3b_active,psa4a_employed,psa4b_active
116210,3218,2000,females,25-34,,2015.0,,,,,
116211,3218,2000,females,35-44,,2530.0,,,,,
116212,3218,2000,females,45-54,,2073.0,,,,,
116213,3218,2000,females,55 and more,,6526.0,,,,,
116214,3218,2000,females,under 25 years,,1641.0,,,,,
116215,3218,2000,males,25-34,,2250.0,,,,,
116216,3218,2000,males,35-44,,2835.0,,,,,
116217,3218,2000,males,45-54,,2288.0,,,,,
116218,3218,2000,males,55 and more,,5319.0,,,,,
116219,3218,2000,males,under 25 years,,1798.0,,,,,


In [18]:
# Save
ru_sa_table.to_csv(repo_root / "cleaned/03_01_outcome_tables/unemploy_sex_age_table.csv")

ru_sa_table

Unnamed: 0,county_code,year,sex,age_group,ru,psa1_nc_pop,psa2_yr_pop,psa3a_employed,psa3b_active,psa4a_employed,psa4b_active
0,201,1995,females,25-34,,5751.0,5740.0,4129.0,4285.0,,
1,201,1995,females,35-44,,7153.0,7825.0,5426.0,5630.0,,
2,201,1995,females,45-54,,5805.0,5051.0,3599.0,3735.0,,
3,201,1995,females,55 and more,,16198.0,6775.0,1491.0,1547.0,,
4,201,1995,females,under 25 years,,4001.0,6879.0,2430.0,2522.0,,
...,...,...,...,...,...,...,...,...,...,...,...
117395,3263,2025,males,25-34,52.0,2464.0,,,,,
117396,3263,2025,males,35-44,92.0,3441.0,,,,,
117397,3263,2025,males,45-54,118.0,2795.0,,,,,
117398,3263,2025,males,55 and more,118.0,6582.0,,,,,
