In [49]:
import pandas as pd
import glob
import numpy as np
import pathlib as pl
import os

### Check non-PKS datasets one by one

In [50]:
# Presumably the directory the processed non-PKS datasets are written to;
# it is not used in the cells visible here — TODO confirm.
PATH_TO_PROCESSED_DATA = pl.Path('../../datasets/processed/non-pks')

# The cells below select files by position (current_file = 0, 1, ...), but
# glob.glob returns paths in arbitrary, OS-dependent order. Sorting makes the
# positional indices stable across machines and runs.
all_non_pks_files = sorted(glob.glob("../../datasets/raw/*.csv"))
all_non_pks_files

['../../datasets/raw\\consumer_price_index_2000_2023.csv',
 '../../datasets/raw\\foreigners_in_germany_2000_2023.csv',
 '../../datasets/raw\\hospitals_general_2000_2023.csv',
 '../../datasets/raw\\income_and_expenditure_of_households_2000_2022.csv',
 '../../datasets/raw\\migration_total_2000_2023.csv',
 '../../datasets/raw\\migration_total_2008_2023_by_age.csv',
 '../../datasets/raw\\net_income_risk_of_poverty2008_2019_1_2.csv',
 '../../datasets/raw\\net_income_risk_of_poverty2020-2023_2_2.csv',
 '../../datasets/raw\\persons_in_employment_2008_2024.csv',
 '../../datasets/raw\\school_graduates_and_dropouts_2000_2023.csv',
 '../../datasets/raw\\suspects_1993-2022.csv',
 '../../datasets/raw\\total_criminal_offences_1993_2022.csv',
 '../../datasets/raw\\total_number_of_suspects_1993_2022.csv',
 '../../datasets/raw\\unemployment_2008_2024.csv']

##### Helper functions

In [51]:
def purge_est_cols(df):
    """Return ``df`` without its ``*_est`` indicator columns.

    Only labels ending in the exact suffix ``"_est"`` are removed. This is the
    suffix produced by ``replace_unnamed_with_est`` and used in the explicit
    ``names=[...]`` lists. Matching on a bare ``"est"`` would also delete
    unrelated columns such as ``"Interest"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the ``*_est`` columns, or ``df`` itself when there
        is nothing to drop.
    """
    # De-duplicate the labels: dropping one label removes every column that
    # carries it, so asking for the same label twice is redundant (and the old
    # drop-inside-a-loop raised KeyError on the second attempt).
    est_labels = list(dict.fromkeys(
        col for col in df.columns
        if isinstance(col, str) and col.endswith("_est")
    ))
    if not est_labels:
        return df
    return df.drop(columns=est_labels)


def mark_dupe_cols(df):
    """Tag repeated column names with a ``"_dupe"`` suffix.

    The first occurrence of a name is kept as is. Every later occurrence of
    that same name becomes ``name + "_dupe"``. The column labels of ``df`` are
    replaced in place and the same frame is returned.
    """
    seen = set()
    relabelled = []
    for label in df.columns:
        if label in seen:
            relabelled.append(label + "_dupe")
            continue
        seen.add(label)
        relabelled.append(label)
    df.columns = relabelled
    return df

def purge_dupe(df):
    """Return ``df`` without the columns that ``mark_dupe_cols`` tagged.

    Every column whose label ends in ``"_dupe"`` is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the ``*_dupe`` columns, or ``df`` itself when there
        is nothing to drop.
    """
    # A name that occurs three or more times yields several identical
    # "<name>_dupe" labels. Dropping label by label inside a loop removed them
    # all on the first hit and then raised KeyError on the next one, so collect
    # the distinct labels and drop them in a single call.
    dupe_labels = list(dict.fromkeys(
        col for col in df.columns
        if isinstance(col, str) and col.endswith("_dupe")
    ))
    if not dupe_labels:
        return df
    return df.drop(columns=dupe_labels)

def replace_unnamed_with_est(df):
    """Rename header-less columns after the column to their left.

    Each column whose label starts with ``"Unnamed"`` (except the very first
    column) is renamed, in place, to ``<current label of the previous column>``
    plus ``"_est"``. Consecutive unnamed columns therefore chain
    (``X_est``, ``X_est_est``, ...). The same frame is returned.
    """
    # Iterate over a snapshot of the labels; the frame is renamed as we go.
    for position, label in enumerate(list(df.columns)):
        is_placeholder = label.startswith("Unnamed")
        if position == 0 or not is_placeholder:
            continue
        left_neighbour = df.columns[position - 1]
        df.rename(columns={label: left_neighbour + "_est"}, inplace=True)

    return df


def ffill_header(df, header_lvl):
    """Forward-fill the ``"Unnamed..."`` cells of a multi-level column header.

    For each of the first ``header_lvl`` header levels, a label starting with
    ``"Unnamed"`` is replaced by the closest real label to its left on that
    level. Unnamed labels that come before any real label get the placeholder
    ``"Year" + str(column position) + str(level)``. The rebuilt levels are
    assigned back to ``df.columns`` and the same frame is returned.
    """
    def _filled_level(level):
        # Labels of one header level, left to right.
        labels = df.columns.get_level_values(level)
        filled = []
        last_real = None  # most recent non-"Unnamed" label seen on this level
        for position in range(len(labels)):
            label = labels[position]
            if not label.startswith("Unnamed"):
                last_real = label
                filled.append(label)
            elif last_real is not None:
                filled.append(last_real)
            else:
                filled.append("Year" + str(position) + str(level))
        return filled

    df.columns = [_filled_level(level) for level in range(header_lvl)]
    return df


### Get all of the datasets loaded and prepared
Loads the datasets and does some basic cleaning. The prepared datasets are stored in the _datasets_ list, in the form _datasets[i] = (DataFrame, filepath to csv)_. 

The data is taken year by year, as the majority of the datasets do not have monthly info. Many of the datasets have an indicator column for each numerical column, which carries additional information about the value (e.g. whether it is an estimate or not). I assumed we would not need them, so I stripped them for now. Other problems are discussed on a case-by-case basis.

#### Consumer Price Index

In [52]:
# Accumulator for every prepared dataset, stored as (DataFrame, source csv path) tuples.
# NOTE: re-running this cell resets the list, so the later cells must be re-run as well.
datasets = []
current_file = 0
# Skip the first 6 lines of the file and load only columns 0, 1 and 3 under explicit names
# (column 2 is not loaded — presumably the indicator column of the index value; TODO confirm).
df = pd.read_csv(all_non_pks_files[current_file], skiprows=6, sep=';', usecols=[0,1,3],  names=["Year", "Consumer price index", "Annual change (%)"])
# The tuple stores a reference to this same DataFrame object, so the in-place
# drop on the next line is also visible through `datasets`.
datasets.append((df, all_non_pks_files[current_file]))
# Remove the last 3 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(3).index, inplace=True)
df

Unnamed: 0,Year,Consumer price index,Annual change (%)
0,2000,75.5,1.3
1,2001,77.0,2.0
2,2002,78.1,1.4
3,2003,78.9,1.0
4,2004,80.2,1.6
5,2005,81.5,1.6
6,2006,82.8,1.6
7,2007,84.7,2.3
8,2008,86.9,2.6
9,2009,87.2,0.3


#### Foreigners in Germany
Has multiple entries per year, one per region of origin. In some of the years there are no records for some regions, denoted by a value of "-". Proposal: focus on the total number of foreigners instead; region analysis is going to be more difficult.


In [53]:
# Foreigners in Germany — file index 1
current_file = 1
# Skip the first 8 lines and supply explicit column names; each numeric column
# is followed by an "_est" indicator column.
df = pd.read_csv(all_non_pks_files[current_file],  sep=";", skiprows=8 ,names=["Year", "Region", "Male", "Male_est", "Female", "Female_est", "Total", "Total_est"])
# Forward-fill the year down the rows that leave it empty (presumably it is only
# given once per year group — TODO confirm) and keep its first 4 characters.
df.loc[:, "Year"] = df.loc[:, "Year"].ffill().str.slice(0,4)
df = purge_est_cols(df)
# Drop the "videlicet:" rows. The explicit .copy() makes the filtered result an
# independent frame, so the in-place drop below does not operate on a slice of
# the original (which can trigger SettingWithCopyWarning).
df = df[~(df["Region"] == "videlicet:")].copy()
# Remove the last 4 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(4).index, inplace=True)
datasets.append((df, all_non_pks_files[current_file]))
df

Unnamed: 0,Year,Region,Male,Female,Total
0,2000,Total,3959376,3337441,7296817
2,2000,Europe,3155602,2702107,5857709
3,2000,EU-27 (since 2020-02-01),-,-,-
4,2000,EU-27 third countries (since 2020-02-01),-,-,-
5,2000,EU-28 (until 2020-01-31),1456057,1215062,2671119
...,...,...,...,...,...
1003,2023,Territory of former Yugoslavia,792825,710015,1502840
1004,2023,Territory of former Serbia and Montenegro,324905,291070,615975
1005,2023,Territory of the former Soviet Union,780850,1155265,1936115
1006,2023,Territory of former Czechoslovakia,63300,69475,132775


#### Hospitals
Removed the first row, which contained the units of measurement for each column.

In [54]:
# Hospitals — file index 2
current_file = 2
# Skip the first 4 lines; the next line is used as the header row.
df = pd.read_csv(all_non_pks_files[current_file], skiprows=4,  sep=";")
# Header-less columns are named after their left neighbour plus "_est", then dropped.
df=replace_unnamed_with_est(df)
df=purge_est_cols(df)
df.rename(columns={df.columns[0]:"Year"}, inplace=True)
# Drop the row labelled 0 (the units-of-measurement row, per the note above this cell).
df = df.loc[1:,:]
# Remove the last 3 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(3).index, inplace=True)
datasets.append((df, all_non_pks_files[current_file]))
df

Unnamed: 0,Year,Hospitals,Beds,Beds per 100 000 inhabitants,Patients,Patients per 100 000 inhabitants,Occupancy / billing days,Average length of stay,Average occupancy of hospital beds
1,2000,2242,559651,681,17262929,21004,167789.0,9.7,81.9
2,2001,2240,552680,671,17325083,21041,163536.0,9.4,81.1
3,2002,2221,547284,664,17432272,21135,159937.0,9.2,80.1
4,2003,2197,541901,657,17295910,20960,153518.0,8.9,77.6
5,2004,2166,531333,644,16801649,20365,146746.0,8.7,75.5
6,2005,2139,523824,635,16539398,20056,143244.0,8.7,74.9
7,2006,2104,510767,620,16832883,20437,142251.0,8.5,76.3
8,2007,2087,506954,616,17178573,20883,142893.0,8.3,77.2
9,2008,2083,503360,613,17519579,21334,142535.0,8.1,77.4
10,2009,2084,503341,615,17817180,21762,142414.0,8.0,77.5


#### Income and expenditure of households
This dataset is more difficult: what each row contains is described by the Type and Subtype columns. I don't know how to combine them well, so it stays like this for now. It also has multiple entries per year. I also removed the Unit column, which described the units.

In [55]:
# Income and expenditure of households — file index 3
current_file = 3
# Skip the first 6 lines; the next line is used as the header row.
df = pd.read_csv(all_non_pks_files[current_file], skiprows=6,  sep=";")
# Give the four leading descriptor columns explicit names.
df.rename(columns={df.columns[0]:"Year",df.columns[1]:"Type",df.columns[2]:"Subtype",df.columns[3]:"Unit"}, inplace=True)
# Header-less columns are named after their left neighbour plus "_est", then dropped.
df = replace_unnamed_with_est(df)
df = purge_est_cols(df)
# Forward-fill Year and Type down the rows that leave them empty; keep the
# first 4 characters of the year.
df.loc[:, "Year"] = df.loc[:, "Year"].ffill().str.slice(0,4)
df.loc[:, "Type"] = df.loc[:, "Type"].ffill()
# Remove the last 3 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(3).index, inplace=True)
df.drop(["Unit"], axis=1, inplace=True)
datasets.append((df, all_non_pks_files[current_file]))
df

Unnamed: 0,Year,Type,Subtype,Households with 1 person,Households with 2 persons,Households with 3 persons,Households with 4 persons,Households with 5 or more persons,Total
0,2000,Households covered,,1602.0,2106.0,870.0,1039.0,-,5757.0
1,2000,Households extrapolated,,12666.0,11935.0,4939.0,3865.0,-,34777.0
2,2000,Average amount per household and month,Household gross income,1965.0,3458.0,4631.0,4787.0,-,3303.0
3,2000,Average amount per household and month,Household net income,1538.0,2763.0,3555.0,3616.0,-,2538.0
4,2000,Average amount per household and month,Disposable income and receipts,1572.0,2931.0,3635.0,3691.0,-,2642.0
...,...,...,...,...,...,...,...,...,...
135,2022,Average amount per household and month,Household gross income,3164.0,5930.0,7438.0,8847.0,9051,5296.0
136,2022,Average amount per household and month,Household net income,2439.0,4621.0,5569.0,6569.0,6928,4056.0
137,2022,Average amount per household and month,Disposable income and receipts,2479.0,4702.0,5677.0,6664.0,7032,4125.0
138,2022,Average amount per household and month,Household final consumption expenditure,1833.0,3239.0,3760.0,4322.0,4632,2846.0


#### Migration 
The column names are generated by compounding a 3-level header, so that is why they look the way they do. Without this it was hard to parse what each means.

In [56]:
# Migration total — file index 4
current_file = 4
# Lines 5, 6 and 7 (0-based) form a 3-level column header.
df = pd.read_csv(all_non_pks_files[current_file], header=[5,6,7],  sep=";")
# Drop the row labelled 0 (presumably a non-data row — TODO confirm).
df = df.loc[1:,:]
# Fill the "Unnamed" header cells from the left, then join the three levels
# into one flat name such as "Germans_Male_Arrivals".
df = ffill_header(df, 3)
df.columns = [f'{i}_{j}_{k}' for i, j,k in df.columns]
df.rename(columns={(df.columns[0]):"Year"}, inplace=True)
# Remove the last 3 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(3).index, inplace=True)
# A column whose header cells were all "Unnamed" inherits the full name of its
# left neighbour; mark those repeats and drop them (presumably the indicator
# columns — TODO confirm).
df = mark_dupe_cols(df)
df = purge_dupe(df)
datasets.append((df, all_non_pks_files[current_file]))
df

Unnamed: 0,Year,Germans_Male_Arrivals,Germans_Male_Departures,Germans_Male_Migration balance,Germans_Female_Arrivals,Germans_Female_Departures,Germans_Female_Migration balance,Germans_Total_Arrivals,Germans_Total_Departures,Germans_Total_Migration balance,...,Foreigners_Total_Migration balance,Total_Male_Arrivals,Total_Male_Departures,Total_Male_Migration balance,Total_Female_Arrivals,Total_Female_Departures,Total_Female_Migration balance,Total_Total_Arrivals,Total_Total_Departures,Total_Total_Migration balance
1,2000,603223,561172,42051,590714,552104,38610,1193937,1113276,80661,...,86454,1065059,1004023,61036,912732,806653,106079,1977791,1810676,167115
2,2001,619022,574607,44415,610109,567526,42583,1229131,1142133,86998,...,188760,1106217,981032,125185,953821,803248,150573,2060038,1784280,275758
3,2002,597681,562979,34702,591712,559844,31868,1189393,1122823,66570,...,152775,1064669,984134,80535,931369,792559,138810,1996038,1776693,219345
4,2003,573846,554434,19412,565140,544601,20539,1138986,1099035,39951,...,102697,1003715,956268,47447,880118,784917,95201,1883833,1741185,142648
5,2004,579988,564641,15347,555268,543289,11979,1135256,1107930,27326,...,55218,1010173,990934,19239,864503,801198,63305,1874676,1792132,82544
6,2005,539175,548618,-9443,530520,537840,-7320,1069695,1086458,-16763,...,95717,951792,930436,21356,826074,768476,57598,1777866,1698912,78954
7,2006,519466,546920,-27454,512179,536627,-24448,1031645,1083547,-51902,...,74694,923149,923638,-489,791842,768561,23281,1714991,1692199,22792
8,2007,530607,559278,-28671,526001,552420,-26419,1056608,1111698,-55090,...,99003,944393,932859,11534,813566,781187,32379,1757959,1714046,43913
9,2008,541586,575834,-34248,540201,572368,-32167,1081787,1148202,-66415,...,10691,957989,1001563,-43574,828041,840191,-12150,1786030,1841754,-55724
10,2009,534139,552319,-18180,532441,554516,-22075,1066580,1106835,-40255,...,27542,966902,985150,-18248,835398,829863,5535,1802300,1815013,-12713


#### Migration by age
Almost the exact same thing as above, but column two is now age groups for the given year.

In [57]:
# Migration by age — file index 5
current_file = 5
# Lines 5, 6 and 7 (0-based) form a 3-level column header.
df = pd.read_csv(all_non_pks_files[current_file], header=[5,6,7],  sep=";")
# Drop the row labelled 0 (presumably a non-data row — TODO confirm).
df = df.loc[1:,:]
# Fill the "Unnamed" header cells from the left, then join the three levels
# into one flat name.
df = ffill_header(df, 3)
df.columns = [f'{i}_{j}_{k}' for i, j,k in df.columns]
df.rename(columns={(df.columns[0]):"Year", df.columns[1]:"Age group"}, inplace=True)
# Forward-fill the year down the age-group rows; keep its first 4 characters.
df.loc[:, "Year"] = df.loc[:, "Year"].ffill().str.slice(0,4)
# Remove the last 3 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(3).index, inplace=True)
# Columns that inherited their left neighbour's full name are marked and
# dropped (presumably the indicator columns — TODO confirm).
df = mark_dupe_cols(df)
df = purge_dupe(df)
datasets.append((df, all_non_pks_files[current_file]))
df

Unnamed: 0,Year,Age group,Germans_Male_Arrivals,Germans_Male_Departures,Germans_Male_Migration balance,Germans_Female_Arrivals,Germans_Female_Departures,Germans_Female_Migration balance,Germans_Total_Arrivals,Germans_Total_Departures,...,Foreigners_Total_Migration balance,Total_Male_Arrivals,Total_Male_Departures,Total_Male_Migration balance,Total_Female_Arrivals,Total_Female_Departures,Total_Female_Migration balance,Total_Total_Arrivals,Total_Total_Departures,Total_Total_Migration balance
1,2008,under 1 year,4338,3465,873,4044,3312,732,8382,6777,...,1494,5678,4031,1647,5326,3874,1452,11004,7905,3099
2,2008,1 year,8666,8593,73,8016,7901,115,16682,16494,...,1966,11075,10001,1074,10366,9286,1080,21441,19287,2154
3,2008,2 years,7115,7434,-319,6606,6955,-349,13721,14389,...,1328,9281,8936,345,8616,8301,315,17897,17237,660
4,2008,3 years,6367,6927,-560,6213,6690,-477,12580,13617,...,957,8365,8444,-79,8098,8099,-1,16463,16543,-80
5,2008,4 years,5403,6052,-649,5089,5650,-561,10492,11702,...,663,7263,7598,-335,6879,7091,-212,14142,14689,-547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1548,2023,92 years,219,245,-26,576,620,-44,795,865,...,-29,233,278,-45,615,669,-54,848,947,-99
1549,2023,93 years,177,185,-8,489,539,-50,666,724,...,-15,196,214,-18,522,577,-55,718,791,-73
1550,2023,94 years,140,152,-12,348,394,-46,488,546,...,-9,150,173,-23,371,415,-44,521,588,-67
1551,2023,95 years and over,245,270,-25,842,970,-128,1087,1240,...,-109,257,337,-80,885,1067,-182,1142,1404,-262


#### Net income, risk of poverty 2008-2019
Has multiple entries per year, one for each combination of sex and age group.

In [58]:
# Net income and risk of poverty, years 2008-2019 — file index 6
current_file = 6
# Skip the first 6 lines; the next line is used as the header row.
df = pd.read_csv(all_non_pks_files[current_file], skiprows=6, sep=";")
# Drop the row labelled 0 (presumably a units/sub-header row — TODO confirm).
df = df.loc[1:,:]
df.rename(columns={(df.columns[0]):"Year", df.columns[1]:"Sex", df.columns[2]:"Age groups"}, inplace=True)
# Forward-fill Year and Sex down the rows that leave them empty; keep the
# first 4 characters of the year.
df.loc[:, "Year"] = df.loc[:, "Year"].ffill().str.slice(0,4)
df.loc[:, "Sex"] = df.loc[:, "Sex"].ffill()
# Header-less columns are named after their left neighbour plus "_est", then dropped.
df = replace_unnamed_with_est(df)
df = purge_est_cols(df)
# Remove the last 3 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(3).index, inplace=True)

datasets.append((df, all_non_pks_files[current_file]))
df


Unnamed: 0,Year,Sex,Age groups,Persons covered,Persons extrapolated,Equivalised net income (median),At-risk-of-poverty rate
1,2008,Male,16 to under 18 years,348,968.0,17723,19.0
2,2008,Male,18 to under 25 years,960,3510.0,18731,18.0
3,2008,Male,25 to under 50 years,4321,13873.0,20103,12.8
4,2008,Male,50 to under 65 years,2996,7518.0,20565,16.0
5,2008,Male,65 years and over,3072,7578.0,17147,12.0
...,...,...,...,...,...,...,...
212,2019,Total,18 to under 25 years,1285,6069.0,22099,18.8
213,2019,Total,25 to under 50 years,5975,26119.0,25188,12.3
214,2019,Total,50 to under 65 years,6772,19051.0,25537,16.1
215,2019,Total,65 years and over,6718,16640.0,20425,18.0


#### Net income, risk of poverty 2020-2023
This is apparently the continuation of the previous dataset, but here they started measuring completely different metrics, so this one is broken down by household type instead of by age group and gender.

In [59]:
# Net income and risk of poverty, years 2020-2023 — file index 7
current_file = 7
# Skip the first 6 lines; the next line is used as the header row.
df = pd.read_csv(all_non_pks_files[current_file], skiprows=6, sep=";")
# Drop the row labelled 0 (presumably a units/sub-header row — TODO confirm).
df = df.loc[1:,:]
df.rename(columns={(df.columns[0]):"Year", df.columns[1]:"Household type"}, inplace=True)
# Forward-fill the year down the household-type rows; keep its first 4 characters.
df.loc[:, "Year"] = df.loc[:, "Year"].ffill().str.slice(0,4)
# Header-less columns are named after their left neighbour plus "_est", then dropped.
df = replace_unnamed_with_est(df)
df = purge_est_cols(df)
# Remove the last 3 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(3).index, inplace=True)

datasets.append((df, all_non_pks_files[current_file]))
df

Unnamed: 0,Year,Household type,Total,Persons at risk of poverty or exclusion,Persons at risk of poverty,Severely materially and socially deprived persons,Persons in househ. w. very low employm. intensity
1,2020,Households without children,48183.0,9994.0,8383.0,1865,2506.0
2,2020,Persons living alone,16468.0,5535.0,4728.0,1212,1608.0
3,2020,Two adults without children,25660.0,3694.0,3081.0,516,728.0
4,2020,Three or more adults without children,6054.0,765.0,575.0,137,170.0
5,2020,Households with children,33972.0,6741.0,4820.0,1720,2611.0
6,2020,Lone parents,3686.0,1729.0,1142.0,485,959.0
7,2020,Two adults with children,26350.0,4393.0,3234.0,1077,1441.0
8,2020,Three or more adults with children,3936.0,619.0,444.0,159,211.0
9,2020,Total,82175.0,16735.0,13203.0,3585,5118.0
10,2021,Households without children,47793.0,9907.0,7911.0,1903,2857.0


#### Persons in employment
Has entries for each quarter, so it is a possible candidate for aggregating to yearly values. Also, "Type of values" has only 2 values - "Unadjusted values" and "X13 JDemetra+  seasonally adjusted". I do not really understand the difference, but maybe we could use only one of them?

In [60]:
# Persons in employment — file index 8
current_file = 8
# Lines 6 and 7 (0-based) form a 2-level column header.
df = pd.read_csv(all_non_pks_files[current_file], header=[6,7], sep=";")
# Drop the row labelled 0 (presumably a non-data row — TODO confirm).
df = df.loc[1:,:]

# Fill the "Unnamed" header cells from the left, then join both levels into one flat name.
df = ffill_header(df, 2)
df.columns = [f'{i}_{j}' for i, j in df.columns]
df.rename(columns={df.columns[0]:"Type of values", df.columns[1]:"Year", df.columns[2]:"Quarter"}, inplace=True)
# Cast Year to the pandas string dtype so the .str accessor below can be used.
df = df.astype({"Year": "string"})
df.loc[:, "Year"] = df.loc[:, "Year"].ffill().str.slice(0,4)
# Columns that inherited their left neighbour's full name are marked and
# dropped (presumably the indicator columns — TODO confirm).
df = mark_dupe_cols(df)
df = purge_dupe(df)
df.loc[:, "Type of values"] = df.loc[:, "Type of values"].ffill()
# Remove the last 3 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(3).index, inplace=True)
datasets.append((df, all_non_pks_files[current_file]))
df

Unnamed: 0,Type of values,Year,Quarter,Place of work in Germany_Employees,Place of work in Germany_Rate of change on previous quarter,Place of work in Germany_Rate of change on previous year's quarter,Place of residence in Germany_Employees,Place of residence in Germany_Rate of change on previous quarter,Place of residence in Germany_Rate of change on previous year's quarter
1,Unadjusted values,2008,Quarter 1,35915,-1.1,1.9,35904,-0.9,1.9
2,Unadjusted values,2008,Quarter 2,36246,0.9,1.6,36160,0.7,1.7
3,Unadjusted values,2008,Quarter 3,36561,0.9,1.5,36432,0.8,1.6
4,Unadjusted values,2008,Quarter 4,36765,0.6,1.3,36714,0.8,1.4
5,Unadjusted values,2009,Quarter 1,36170,-1.6,0.7,36167,-1.5,0.7
...,...,...,...,...,...,...,...,...,...
132,X13 JDemetra+ seasonally adjusted,2023,Quarter 4,42237,0.1,x,42029,0.1,x
133,X13 JDemetra+ seasonally adjusted,2024,Quarter 1,42289,0.1,x,42086,0.1,x
134,X13 JDemetra+ seasonally adjusted,2024,Quarter 2,42322,0.1,x,42121,0.1,x
135,X13 JDemetra+ seasonally adjusted,2024,Quarter 3,42280,-0.1,x,42082,-0.1,x


#### School graduates and dropouts
3 entries per year - for Men, Women and the total amount.

In [61]:
# School graduates and dropouts — file index 9
# (A leftover debug print of `current_file` was removed here: it ran before the
# assignment below and so only echoed the index left behind by the previous cell.)
current_file = 9
# Skip the first 5 lines; the next line is used as the header row.
df = pd.read_csv(all_non_pks_files[current_file], skiprows=5,  sep=";")
# Drop the row labelled 0 (presumably a non-data row — TODO confirm).
df = df.loc[1:,:]
# In this file the first column is the sex and the second one is the year.
df.rename(columns={(df.columns[1]):"Year", df.columns[0]:"Sex"}, inplace=True)
# Forward-fill Year and Sex down the rows that leave them empty; keep the
# first 4 characters of the year.
df.loc[:, "Year"] = df.loc[:, "Year"].ffill().str.slice(0,4)
df.loc[:, "Sex"] = df.loc[:, "Sex"].ffill()
# Header-less columns are named after their left neighbour plus "_est", then dropped.
df = replace_unnamed_with_est(df)
df = purge_est_cols(df)
# Remove the last 3 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(3).index, inplace=True)

datasets.append((df, all_non_pks_files[current_file]))
df


8


Unnamed: 0,Sex,Year,Pupils,School beginners,School leavers (graduates and dropouts)
1,Male,2000,5062891,419055,473494
2,Male,2001,5016905,401771,476530
3,Male,2002,4969732,413783,483269
4,Male,2003,4941969,433303,500920
5,Male,2004,4889799,422289,488378
...,...,...,...,...,...
68,Total,2019,8326884,733443,749946
69,Total,2020,8380767,749820,768191
70,Total,2021,8436221,770682,769411
71,Total,2022,8693344,813540,782423


#### Suspects
Multi level header, otherwise nothing special.

In [62]:
# Suspects — file index 10
current_file = 10
# Lines 5 and 6 (0-based) form a 2-level column header.
df = pd.read_csv(all_non_pks_files[current_file], header=[5,6],  sep=";")
# Fill the "Unnamed" header cells from the left, then join both levels into
# one flat name such as "Germans_Male".
df = ffill_header(df, 2)
df.columns = [f'{i}_{j}' for i, j in df.columns]
df.rename(columns={(df.columns[0]):"Year"}, inplace=True)
# Columns that inherited their left neighbour's full name are marked and
# dropped (presumably the indicator columns — TODO confirm).
df = mark_dupe_cols(df)
df = purge_dupe(df)
# Remove the last 3 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(3).index, inplace=True)

datasets.append((df, all_non_pks_files[current_file]))
df

Unnamed: 0,Year,Germans_Male,Germans_Female,Foreigners_Male,Foreigners_Female
0,1993,992608.0,302768.0,552382.0,115741.0
1,1994,1028234.0,317113.0,491962.0,100342.0
2,1995,1083993.0,336442.0,479194.0,101857.0
3,1996,1127829.0,352794.0,494045.0,107615.0
4,1997,1157673.0,365165.0,494694.0,111768.0
5,1998,1185583.0,380122.0,487841.0,113575.0
6,1999,1170218.0,368350.0,459938.0,114008.0
7,2000,1203643.0,374272.0,450260.0,112363.0
8,2001,1215912.0,378039.0,433074.0,110541.0
9,2002,1252792.0,394033.0,431672.0,113107.0


#### Total criminal offences

In [63]:
# Total criminal offences — file index 11
current_file = 11
# Line 4 (0-based) is the header row.
df = pd.read_csv(all_non_pks_files[current_file], header=[4],  sep=";")
# Drop the row labelled 0 (presumably a non-data row — TODO confirm).
df=df.loc[1:,:]
# Only one header row is read here, so no multi-level column names need to be joined.
df.rename(columns={(df.columns[0]):"Year"}, inplace=True)
# Header-less columns are named after their left neighbour plus "_est", then dropped.
df = replace_unnamed_with_est(df)
df = purge_est_cols(df)
# Remove the last 4 rows (presumably footer/notes lines of the export — TODO confirm).
df.drop(df.tail(4).index, inplace=True)

datasets.append((df, all_non_pks_files[current_file]))
df

Unnamed: 0,Year,Registered criminal offences,Cleared-up criminal offences
1,1993,6750613,2957135
2,1994,6537748,2899733
3,1995,6668717,3068379
4,1996,6647598,3255042
5,1997,6586165,3335016
6,1998,6456996,3376524
7,1999,6302316,3329124
8,2000,6264723,3335356
9,2001,6363865,3379618
10,2002,6507394,3425416


#### Total number of suspects 
This is the exact same csv as suspects_1993-2022.csv, skipping..

In [64]:
# File index 12 is deliberately not loaded (see the note above this cell: it
# duplicates the suspects csv). Only the index is advanced, and nothing is
# appended to `datasets`.
current_file = 12

#### Unemployment
Has entries per month, should probably reduce it to years.

In [65]:
# Unemployment — file index 13
current_file = 13
# Lines 6 and 7 (0-based) form a 2-level column header.
df = pd.read_csv(all_non_pks_files[current_file], header=[6,7],  sep=";")
# Fill the "Unnamed" header cells from the left, then join both levels into one flat name.
df = ffill_header(df, 2)
df.columns = [f'{i}_{j}' for i, j in df.columns]
df.rename(columns={(df.columns[0]):"Year", df.columns[1]:"Month"}, inplace=True)
# Drop the row labelled 0 (presumably a non-data row — TODO confirm).
df=df.loc[1:,:]
# Columns that inherited their left neighbour's full name are marked and
# dropped (presumably the indicator columns — TODO confirm).
df = mark_dupe_cols(df)
df = purge_dupe(df)
# Forward-fill the year down the monthly rows; keep its first 4 characters.
df.loc[:, "Year"] = df.loc[:, "Year"].ffill().str.slice(0,4)
# Remove the last 4 rows (presumably footer/notes lines of the export — TODO confirm).
# NOTE(review): the displayed output shows "..." as values in the final rows —
# possibly not-yet-published figures; confirm before converting to numbers.
df.drop(df.tail(4).index, inplace=True)

datasets.append((df, all_non_pks_files[current_file]))
df

Unnamed: 0,Year,Month,Unadjusted values_Economically active population,Unadjusted values_Persons in employment,Unadjusted values_Unemployed persons,Unadjusted values_Unemployment rate,X13 JDemetra+ trend_Economically active population,X13 JDemetra+ trend_Persons in employment,X13 JDemetra+ trend_Unemployed persons,X13 JDemetra+ trend_Unemployment rate,BV4.1 trend_Economically active population,BV4.1 trend_Persons in employment,BV4.1 trend_Unemployed persons,BV4.1 trend_Unemployment rate
1,2008,January,40.42,37.11,3.31,8.2,40.63,37.40,3.23,8.0,40.65,37.42,3.23,8.0
2,2008,February,40.34,37.07,3.27,8.1,40.63,37.44,3.19,7.9,40.66,37.47,3.19,7.8
3,2008,March,40.43,37.18,3.25,8.0,40.62,37.47,3.15,7.8,40.67,37.52,3.15,7.7
4,2008,April,40.49,37.28,3.21,7.9,40.62,37.51,3.11,7.7,40.68,37.57,3.11,7.6
5,2008,May,40.31,37.37,2.94,7.3,40.63,37.56,3.07,7.6,40.66,37.60,3.06,7.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,2024,August,44.30,42.60,1.70,3.8,44.49,42.93,1.56,3.5,44.54,42.96,1.58,3.6
201,2024,September,44.72,43.25,1.47,3.3,44.51,42.96,1.55,3.5,44.56,42.96,1.60,3.6
202,2024,October,...,...,...,...,...,...,...,...,...,...,...,...
203,2024,November,...,...,...,...,...,...,...,...,...,...,...,...


In [68]:
# Sanity check: number of prepared datasets. One of the 14 csv files (index 12)
# is skipped, so 13 entries are expected.
print(len(datasets))
# Show the first entry, a (DataFrame, source csv path) tuple.
datasets[0]

13


(    Year  Consumer price index  Annual change (%)
 0   2000                  75.5                1.3
 1   2001                  77.0                2.0
 2   2002                  78.1                1.4
 3   2003                  78.9                1.0
 4   2004                  80.2                1.6
 5   2005                  81.5                1.6
 6   2006                  82.8                1.6
 7   2007                  84.7                2.3
 8   2008                  86.9                2.6
 9   2009                  87.2                0.3
 10  2010                  88.1                1.0
 11  2011                  90.0                2.2
 12  2012                  91.7                1.9
 13  2013                  93.1                1.5
 14  2014                  94.0                1.0
 15  2015                  94.5                0.5
 16  2016                  95.0                0.5
 17  2017                  96.4                1.5
 18  2018                  98.1