## In a similar fashion as for the Arbeitsstellen, this code scrapes the data for Berufsausbildungsstellen, Bewerber, etc. from the website of the BA

Source: https://statistik.arbeitsagentur.de/SiteGlobals/Forms/Suche/Einzelheftsuche_Formular.html?nn=1459818&topic_f=ausb-ausbildungsstellenmarkt-mit-zkt

Unfortunately, I didn't save the part of the code that downloads all Excel and PDF files from the BA's website, but I do have the folder with all those files. The download code, however, can easily be replicated by following the same logic as the code that downloads the Arbeitsstellen data from the BA. 

In [7]:

# Settings

import pandas as pd

# Show every row and column when a DataFrame is displayed (None = no limit).
# The canonical option keys use an underscore; 'display.max.rows' only worked
# because pandas resolves option names by regular-expression match.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)




Between 2020 and 2025, the data is stored in Excel files, from which we extract the relevant tables and merge them together. 

In [None]:
# Extract data from Excel files
########################################################################
# For every monthly workbook in the folder: parse Bundesland/year/month from
# the filename, locate the sheet whose first rows contain one of the expected
# table titles, detect the header row, read the table and tag it with metadata.

import os
import pandas as pd
import re

# Path to the folder containing Excel files
excel_folder = r"C:\Users\jhummels\OneDrive - DIW Berlin\Gehlen, Annica's files - retirement-labor-shortages\Data\BA_data\B_Bewerber und Berufsausbildungsstellen\arbeitsagentur_excels_Ausbildung"

# One DataFrame per successfully parsed workbook
all_dfs = []

# Define ALL acceptable title snippets (case-insensitive, partial match)
title_snippets = [
    "bewerberinnen und bewerber für berufsausbildungsstellen und berufsausbildungsstellen nach berufsbereichen und -gruppen",
    "bewerberinnen und bewerber sowie betriebliche berufsausbildungsstellen nach berufen",
]

# Loop through all Excel files in the folder
for file in os.listdir(excel_folder):
    if not file.endswith(".xlsx"):
        continue
    filepath = os.path.join(excel_folder, file)

    # Extract metadata from filename: two-digit Bundesland code and YYYYMM
    match = re.search(r"ausbildungsstellenmarkt-mit-zkt-(\d{2})-0-(\d{6})", file)
    if not match:
        print(f"⚠️ Filename doesn't match expected pattern: {file}")
        continue

    bundesland = match.group(1)
    year = int(match.group(2)[:4])
    month = int(match.group(2)[4:])

    # Keep only files between November 2020 and March 2025 (both inclusive)
    if not (2020, 11) <= (year, month) <= (2025, 3):
        continue

    try:
        # The context manager closes the workbook handle on every exit path
        # (including `continue` and exceptions); previously it was never closed.
        with pd.ExcelFile(filepath) as xls:
            sheet_names = xls.sheet_names

            # Search for the correct sheet based on sheet content (first 3 rows)
            sheet_name = None
            for s in sheet_names:
                try:
                    preview = pd.read_excel(xls, sheet_name=s, nrows=3, header=None)
                    # Join all non-empty cells into one lower-cased string.
                    # (The former `.astype(str).fillna("")` left empty cells
                    # in the text as the literal "nan".)
                    first_rows_text = " ".join(
                        str(value) for value in preview.values.flatten() if pd.notna(value)
                    ).lower()

                    if any(snippet in first_rows_text for snippet in title_snippets):
                        sheet_name = s
                        break
                except Exception as e:
                    print(f"⚠️ Could not preview sheet {s} in {file}: {e}")

            if sheet_name is None:
                print(f"❌ No sheet with expected title found in {file}. Sheets: {sheet_names}")
                continue
            print(f"✅ Selected sheet by content: {sheet_name} in file: {file}")

            # Try finding the header row by previewing the top 12 rows
            preview = pd.read_excel(xls, sheet_name=sheet_name, nrows=12, header=None)

            # The header row is taken to be the first row whose printed values
            # contain the characters "1", "2", "3" and "4".
            # NOTE(review): this is a substring test on the row's repr, so a row
            # holding e.g. "2134" would also qualify — tweak logic as needed.
            for i, row in preview.iterrows():
                row_text = str(row.values)
                if all(digit in row_text for digit in ("1", "2", "3", "4")):
                    header_row = i
                    break
            else:
                print(f"⚠️ Could not find valid header row in {file}.")
                continue

            # Now read the full data starting from the detected header row
            df = pd.read_excel(xls, sheet_name=sheet_name, header=header_row)

        # Add metadata
        df["Bundesland"] = bundesland
        df["Year"] = year
        df["Month"] = month
        df["source_file"] = file

        all_dfs.append(df)

    except Exception as e:
        # Best effort: report the failing file and carry on with the next one
        print(f"❌ Error reading file {file}: {e}")

# Combine all DataFrames
if all_dfs:
    combined_excel_df = pd.concat(all_dfs, ignore_index=True)
    print(f"✓ Combined {len(all_dfs)} files. Total rows: {combined_excel_df.shape[0]}")
    print(combined_excel_df.head())
else:
    print("❗ No valid Excel files were processed.")




The extraction yields a data frame whose columns are somewhat misaligned. In particular, the relevant columns are sometimes stored in 1 to 12 and sometimes in 17 to 39, so the following code renames the columns and aligns the data frame correctly.

In [None]:
def clean_column_names(df):
    """Rename the columns of *df* in place and return the same DataFrame.

    Integer labels and all-digit string labels become ``col_<n>`` the first
    time a given number is seen and ``dup_<n>`` on every later occurrence.
    The label ``'Unnamed: 0'`` becomes ``'Berufsgruppe'``; every other label
    is converted with ``str()``.
    """
    numbers_seen = set()

    def _rename(label):
        is_number = isinstance(label, int) or (isinstance(label, str) and label.isdigit())
        if not is_number:
            return "Berufsgruppe" if label == 'Unnamed: 0' else str(label)

        key = str(label)
        if key in numbers_seen:
            return f"dup_{key}"
        numbers_seen.add(key)
        return f"col_{key}"

    df.columns = [_rename(label) for label in df.columns]
    return df
# clean_column_names assigns the new labels on the frame it receives, so `df`
# and `combined_excel_df` refer to the same, renamed DataFrame from here on.
df = clean_column_names(combined_excel_df)
print(df.columns)

# Target column layout shared by both halves. The following cleaning step
# renames columns by position, so both halves must follow exactly this order.
desired_order = [
    'Berufsgruppe', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6',
    'col_7', 'col_8', 'col_9', 'col_10', 'col_11', 'col_12',
    'Bundesland', 'Year', 'Month', 'source_file'
]

# --- Rows whose values were stored in the duplicate ("dup_*") columns ---
right = df.dropna(subset=['dup_1'])
print(right.head(10))

cols_to_drop = [f'col_{i}' for i in range(1, 13)]
right = right.drop(columns=cols_to_drop)
print(right.head(10))

# Rename 'dup_1' to 'col_1', ..., 'dup_12' to 'col_12'
rename_dict = {f'dup_{i}': f'col_{i}' for i in range(1, 13)}
right = right.rename(columns=rename_dict)

# Reorder the columns to match
right = right[desired_order]
print(right.head(10))

# --- Rows whose values were stored in the first ("col_*") columns ---
left = df.dropna(subset=['col_1'])
print(left.head(10))

cols_to_drop = [f'dup_{i}' for i in range(1, 13)]
left = left.drop(columns=cols_to_drop)
# Enforce the target order explicitly instead of relying on the incidental
# column order of the concatenated workbook data.
left = left[desired_order]
print(left.head(10))

# Merge the two parts from the Excel data frame:

combined_df = pd.concat([left, right], ignore_index=True)
print(combined_df.head())

Index(['Berufsgruppe', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6',
       'col_7', 'col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'Bundesland',
       'Year', 'Month', 'source_file', 'dup_1', 'dup_2', 'dup_3', 'dup_4',
       'dup_5', 'dup_6', 'dup_7', 'dup_8', 'dup_9', 'dup_10', 'dup_11',
       'dup_12'],
      dtype='object')

Next, I clean the combined data frame (dealing with special characters, adding BKZ, adjusting column names, etc.)

In [None]:
# Clean the combined DataFrame
###########################################################

import numpy as np

# 1. Rename columns by position (combined_df must have exactly these 17
#    columns in this order).
combined_df.columns = [
    "Beruf",
    "Bewerber_Anzahl_Ins", "Bewerber_V_Ins",
    "Bewerber_Unversorgt_Anzahl", "Bewerber_V_Unversorgt",
    "Stellen_Anzahl", "Stellen_V",
    "Unbesetzt_Anzahl", "Unbesetzt_V",
    "Bewerber_pro_100_Stellen", "Bewerber_pro_100_Stellen_V",
    "Unversorgte_pro_100_Unbesetzt", "Unversorgte_pro_100_Unbesetzt_V",
    "Bundesland", "Year", "Month", "source_file"
]

# 2. Replace the placeholder cells "x", "*", "-" and ".x" with NaN
#    (only cells that equal one of these values exactly are replaced).
combined_df = combined_df.replace(["x", "*", "-", ".x"], np.nan)

# 3. Drop rows where Beruf is NaN. The .copy() makes the result an independent
#    frame, so the column assignment below does not write to a slice of the
#    original (SettingWithCopyWarning).
combined_df = combined_df[combined_df["Beruf"].notna()].copy()

# 4. Extract BKZ code (1-3 digit code at start of Beruf string);
#    expand=False returns a Series instead of a one-column DataFrame.
combined_df["BKZ"] = combined_df["Beruf"].astype(str).str.extract(r"^(\d{1,3})", expand=False)

# 5. Keep only rows that have a valid BKZ
combined_df = combined_df[combined_df["BKZ"].notna()]

# 6. Reset index
combined_df = combined_df.reset_index(drop=True)

# 7. Preview cleaned data
combined_df.head(100)


Preview combined_df

In [None]:
# Display the first 10 rows of combined_df.
combined_df.head(10)

Summarize combined_df

In [108]:
# Summary statistics for every column; include="all" also covers non-numeric columns.
combined_df.describe(include="all")

Unnamed: 0,Beruf,Bewerber_Anzahl_Ins,Bewerber_V_Ins,Bewerber_Unversorgt_Anzahl,Bewerber_V_Unversorgt,Stellen_Anzahl,Stellen_V,Unbesetzt_Anzahl,Unbesetzt_V,Bewerber_pro_100_Stellen,Bewerber_pro_100_Stellen_V,Unversorgte_pro_100_Unbesetzt,Unversorgte_pro_100_Unbesetzt_V,Bundesland,Year,Month,source_file,BKZ
count,92992,84497.0,85226.0,44896.0,47631.0,86275.0,85918.0,46467.0,48191.0,71449.0,75162.0,34709.0,35237.0,92992.0,92992.0,92992.0,92992,92992.0
unique,116,,,,,,,,,,,,,16.0,,,832,116.0
top,111 Landwirtschaft,,,,,,,,,,,,,1.0,,,ausbildungsstellenmarkt-mit-zkt-16-0-202101-xl...,111.0
freq,832,,,,,,,,,,,,,5812.0,,,115,832.0
mean,,294.943051,-0.779236,127.59346,-1.451855,399.924323,0.912596,199.316117,1.073952,83.325111,2.91618,91.099754,1.665302,,2022.481934,6.389711,,
std,,973.365286,38.299426,417.452833,43.420283,1366.782655,37.114933,716.43719,43.196051,171.596997,76.309642,170.877291,99.298857,,1.303234,3.605148,,
min,,0.0,-100.0,0.0,-100.0,0.0,-100.0,0.0,-100.0,0.0,-2380.0,0.0,-3116.190476,,2020.0,1.0,,
25%,,11.0,-15.419904,5.0,-18.75,13.0,-13.68991,6.0,-16.1,1.9,-1.306235,4.216867,-5.314371,,2021.0,3.0,,
50%,,51.0,-0.3,22.0,0.0,63.0,0.0,31.0,0.0,28.44,1.12,35.294118,0.877193,,2022.0,6.0,,
75%,,188.0,10.294118,84.0,10.126582,256.0,10.71,126.0,12.0,91.111111,5.245395,100.0,7.0,,2024.0,10.0,,


Export combined_df which is all data for 2020-2025 extracted from the excel files. 

In [None]:
# Write combined_df to CSV; index=False leaves the row index out of the file.
combined_df.to_csv(r"C:\Users\jhummels\OneDrive - DIW Berlin\Gehlen, Annica's files - retirement-labor-shortages\Data\BA_data\B_Bewerber und Berufsausbildungsstellen\arbeitsagentur_dataframes_Ausbildungen\combined_excel_df.csv", index=False)

Next, I extract all data before 2020 that is stored in PDF files. To extract the tables from the PDF files, I use the camelot package. Within camelot, I use the network algorithm, which identifies the right table alignment and stores the columns. However, the code takes a long time to run. Moreover, the desired tables are stored on different pages depending on the publishing date and the Bundesland. I identified a pattern in the page numbers depending on the publishing date and define the specific pages based on that pattern. 

The following code focuses on the specific pdf file 'ausbildungsstellenmarkt-mit-zkt-14-0-201802-pdf.pdf' to identify the right extraction method and plots the table selection of the network algorithm in the camelot package.

In [None]:
import camelot
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Path to the specific PDF file
pdf_document = r"C:\Users\jhummels\OneDrive - DIW Berlin\Gehlen, Annica's files - retirement-labor-shortages\Data\BA_data\B_Bewerber und Berufsausbildungsstellen\arbeitsagentur_pdfs_Ausbildungen\ausbildungsstellenmarkt-mit-zkt-14-0-201802-pdf.pdf"

# Extract tables from pages 8, 9 and 10 with the 'network' parser;
# strip_text="\n" removes line breaks inside cells.
tables = camelot.read_pdf(pdf_document, pages='8,9,10', flavor='network', strip_text="\n")

print(f"✅ Found {len(tables)} table(s)")

if len(tables) >= 2:
    df1 = tables[0].df
    df2 = tables[1].df

    # Combine the first two tables
    combined_df = pd.concat([df1, df2], ignore_index=True)

    # Show and save
    # NOTE(review): the file name still says pages 16/17 although pages 8-10
    # are read above; it is kept so existing output files keep their name.
    print(combined_df.head(10))
    combined_df.to_csv("page16_17_combined.csv", index=False)

else:
    print("❌ Less than 2 tables found, cannot combine.")

# The third table is only touched when it exists; previously tables[2] was
# read after checking for just two tables, which raised IndexError when
# exactly two (or fewer) tables were found.
if len(tables) >= 3:
    df3 = tables[2].df

    # Plot the grid detected for the third table; increase size and sharpness
    fig = camelot.plot(tables[2], kind ='grid')
    fig.set_size_inches(50, 50)

    # Tick every 10 units on the x axis to make column positions readable
    xs = np.arange(0, 600, 10)
    ax = fig.gca()
    ax.set_xticks(xs)
    # ax.set_yticks(ys)
    fig.show()
else:
    print("❌ Less than 3 tables found, nothing to plot.")



The tables extracted from this one specific PDF file are stored in the following data frame. 

In [None]:
# Display the first 100 rows of combined_df.
combined_df.head(100)

The following code uses the insights from the one specific pdf file and generalizes it to all downloaded pdf files between 2014 and 2015.

In [None]:
import camelot
import pandas as pd
import os
import re

# Folder that holds the downloaded PDF reports
pdf_folder = r"C:\Users\jhummels\OneDrive - DIW Berlin\Gehlen, Annica's files - retirement-labor-shortages\Data\BA_data\B_Bewerber und Berufsausbildungsstellen\arbeitsagentur_pdfs_Ausbildungen"

# Accumulates one DataFrame per PDF that yielded at least two tables
all_dfs = []

# Walk over every PDF in the folder
for file in os.listdir(pdf_folder):
    if not file.endswith(".pdf"):
        continue
    filepath = os.path.join(pdf_folder, file)

    # The filename carries the two-digit Bundesland code and the YYYYMM period
    match = re.search(r"ausbildungsstellenmarkt-mit-zkt-(\d{2})-0-(\d{6})", file)
    if not match:
        print(f"⚠️ Skipping: Filename doesn't match expected pattern: {file}")
        continue

    bundesland = match.group(1)
    year = int(match.group(2)[:4])
    month = int(match.group(2)[4:])

    # Only reports from 2014 (August onwards) and 2015 (up to September)
    in_window = (year == 2014 and month >= 8) or (year == 2015 and month <= 9)
    if not in_window:
        print(f"❌ Skipping: {file} ({bundesland}, {year}-{month:02d})")
        continue

    print(f"✅ Processing: {file} ({bundesland}, {year}-{month:02d})")
    # The tables sit one page later in the March, July, August and September issues
    page_range = "17, 18, 19" if month in [3, 7, 8, 9] else "16, 17, 18"

    # Read the tables; a failure in one file must not stop the whole run
    try:
        tables = camelot.read_pdf(filepath, pages=page_range, flavor='network', strip_text="\n")

        if len(tables) < 2:
            print(f"⚠️ Skipping {file}: Less than 2 tables found.")
        else:
            # Stack at most the first three tables and tag them with metadata
            dfs = [tables[i].df for i in range(min(3, len(tables)))]
            combined_df = pd.concat(dfs, ignore_index=True).assign(
                Bundesland=bundesland,
                Year=year,
                Month=month,
                source_file=file,
            )
            all_dfs.append(combined_df)

    except Exception as e:
        print(f"❌ Error reading {file}: {e}")

# Stack the per-file frames and write them to disk
if not all_dfs:
    print("❗ No valid tables extracted from any PDFs.")
else:
    final_df = pd.concat(all_dfs, ignore_index=True)
    print(f"✓ Combined {len(all_dfs)} files. Total rows: {final_df.shape[0]}")
    print(final_df.head())
    final_df.to_csv("combined_all_pdfs.csv", index=False)



Re-import the exported data set from the CSV file: 

In [64]:
# Reload the PDF extraction from the CSV written above. After the round trip
# through CSV the table column labels are strings ('0', '1', ...).
final_df = pd.read_csv("combined_all_pdfs.csv")  

Check summary statistics of the final data frame: 

In [None]:
# Summary statistics for every column; include="all" also covers non-numeric columns.
final_df.describe(include="all")

Output preview of the final data frame:

In [None]:
# Display the first 100 rows of final_df.
final_df.head(100)



Next, I clean the data frame, including renaming columns, replacing special characters, extracting BKZ, etc. 

In [None]:
import numpy as np


def _to_float_or_nan(text):
    """Return ``float(text)`` for numeric-looking strings, NaN otherwise.

    Candidates are the same as before: digits with at most one decimal point
    and any number of '-' characters. A candidate that still is not a valid
    float (e.g. "1-2" or "5-") now yields NaN instead of raising ValueError.
    """
    if not text.replace('.', '', 1).replace('-', '').isdigit():
        return float('nan')
    try:
        return float(text)
    except ValueError:
        return float('nan')


# 1. Rename columns by position (final_df must have exactly these 17 columns
#    in this order).
final_df.columns = [
    "Beruf",
    "Bewerber_Anzahl_Ins", "Bewerber_V_Ins",
    "Bewerber_Unversorgt_Anzahl", "Bewerber_V_Unversorgt",
    "Ausbildungsstellen_Anzahl", "Ausbildungsstellen_V",
    "Ausbildungsstellen_Unbesetzt_Anzahl", "Ausbidlungsstellen_Unbesetzt_V",
    "gemeldete_Berufsausbildungsstellen_je_Bewerber_Vorjahr", "gemeldete_Berufsausbildungsstellen_je_Bewerber_Vor_Vorjahr",
    "unbesetzte_Berufsausbildungsstellen_je_unversorgter_Bewerber_Vorjahr", "unbesetzte_Berufsausbildungsstellen_je_unversorgter_Bewerber_Vor_Vorjahr",
    "Bundesland", "Year", "Month", "source_file"
]

# 2. Replace the placeholder cells "x", "*" and "-" with NaN
final_df = final_df.replace(["x", "*", "-"], np.nan)

# 3. Drop rows where Beruf is NaN. The .copy() makes the result an independent
#    frame, so the column assignments below do not write to a slice of the
#    original (SettingWithCopyWarning).
final_df = final_df[final_df["Beruf"].notna()].copy()

# 4. Extract BKZ code (1-3 digit code at start of Beruf string);
#    expand=False returns a Series instead of a one-column DataFrame.
final_df["BKZ"] = final_df["Beruf"].astype(str).str.extract(r"^(\d{1,3})", expand=False)

# 5. Keep only rows that have a valid BKZ
final_df = final_df[final_df["BKZ"].notna()].copy()


# 6. Convert columns to numeric

# List of columns to convert
columns_to_aggregate = [
    "Bewerber_Anzahl_Ins", "Bewerber_V_Ins",
    "Bewerber_Unversorgt_Anzahl", "Bewerber_V_Unversorgt",
    "Ausbildungsstellen_Anzahl", "Ausbildungsstellen_V",
    "Ausbildungsstellen_Unbesetzt_Anzahl", "Ausbidlungsstellen_Unbesetzt_V",
    "gemeldete_Berufsausbildungsstellen_je_Bewerber_Vorjahr", "gemeldete_Berufsausbildungsstellen_je_Bewerber_Vor_Vorjahr",
    "unbesetzte_Berufsausbildungsstellen_je_unversorgter_Bewerber_Vorjahr", "unbesetzte_Berufsausbildungsstellen_je_unversorgter_Bewerber_Vor_Vorjahr"
]

# German number format: '.' is the thousands separator, ',' the decimal mark.
# NOTE(review): every '.' is removed. If read_csv already parsed a column as
# float, astype(str) yields e.g. '56.0', which this turns into 560 — confirm
# that these columns still hold the original strings at this point.
for col in columns_to_aggregate:
    if col in final_df.columns:
        final_df[col] = final_df[col].astype(str) \
            .str.replace('.', '', regex=False) \
            .str.replace(',', '.', regex=False) \
            .str.strip() \
            .apply(_to_float_or_nan)

# 7. Reset index
final_df = final_df.reset_index(drop=True)

# Preview cleaned data
final_df.head(10)
final_df.describe()

Next, I use the same procedure but for a different time frame (2015-2020) of pdf files for which the format is slightly different (different page numbers).

In [None]:
# `import camelot` was a bare `camelot` expression before, which only worked
# when an earlier cell had already imported the package.
import camelot
import pandas as pd
import os
import re

# Path to the folder containing the PDF files
pdf_folder = r"C:\Users\jhummels\OneDrive - DIW Berlin\Gehlen, Annica's files - retirement-labor-shortages\Data\BA_data\B_Bewerber und Berufsausbildungsstellen\arbeitsagentur_pdfs_Ausbildungen"

# List to collect all DataFrames
all_dfs = []

# Loop through all PDF files in the folder
for file in os.listdir(pdf_folder):
    if not file.endswith(".pdf"):
        continue
    filepath = os.path.join(pdf_folder, file)

    # Extract metadata from filename: two-digit Bundesland code and YYYYMM
    match = re.search(r"ausbildungsstellenmarkt-mit-zkt-(\d{2})-0-(\d{6})", file)
    if not match:
        print(f"⚠️ Skipping: Filename doesn't match expected pattern: {file}")
        continue

    bundesland = match.group(1)
    year = int(match.group(2)[:4])
    month = int(match.group(2)[4:])

    # Pick the pages used for the January/February issues from the period.
    # Periods are compared as (year, month) tuples, both bounds inclusive.
    period = (year, month)
    if (2015, 10) <= period <= (2017, 10) or (2018, 4) <= period <= (2020, 10):
        # The previous condition for the second window only matched 2018 (from
        # April) and 2020 (to October), so every 2019 file was skipped.
        # NOTE(review): assumes the 2019 issues share the 2018/2020 page layout.
        jan_feb_pages = "8,9,10"
    elif (2017, 11) <= period <= (2018, 3):
        jan_feb_pages = "9,10,11"
    else:
        print(f"❌ Skipping: {file} ({bundesland}, {year}-{month:02d})")
        continue

    print(f"✅ Processing (2015–2020): {file} ({bundesland}, {year}-{month:02d})")
    page_range = jan_feb_pages if month in [1, 2] else "16,17,18"

    # Try reading and combining tables
    try:
        tables = camelot.read_pdf(filepath, pages=page_range, flavor='network', strip_text="\n")

        if len(tables) >= 2:
            # All tables found on the selected pages are stacked (no cap at 3)
            dfs = [table.df for table in tables]
            combined_df = pd.concat(dfs, ignore_index=True)

            # Add metadata
            combined_df["Bundesland"] = bundesland
            combined_df["Year"] = year
            combined_df["Month"] = month
            combined_df["source_file"] = file

            all_dfs.append(combined_df)
        else:
            print(f"⚠️ Skipping {file}: Less than 2 tables found.")

    except Exception as e:
        # Best effort: report the failing file and carry on with the next one
        print(f"❌ Error reading {file}: {e}")

# Combine and export all data
if all_dfs:
    final_df_2 = pd.concat(all_dfs, ignore_index=True)
    print(f"✓ Combined {len(all_dfs)} files. Total rows: {final_df_2.shape[0]}")
    print(final_df_2.head())
    final_df_2.to_csv("combined_all_pdfs_2.csv", index=False)
else:
    print("❗ No valid tables extracted from any PDFs.")




Preview the final data frame:

In [None]:
# Display the first 10 rows of final_df_2.
final_df_2.head(10)

Look at summary statistics of the data frame 

In [None]:
# Summary statistics for every column; include="all" also covers non-numeric columns.
final_df_2.describe(include="all")

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', 'Bundesland', 'Year', 'Month', 'source_file'],
      dtype='object')

Re-import the data frame from the exported CSV:

In [66]:
# Reload the second PDF extraction from the CSV written above. After the round
# trip through CSV the table column labels are strings ('0', '1', ...).
final_df_2 = pd.read_csv("combined_all_pdfs_2.csv")

Next, I clean the data frame, including renaming columns, replacing special characters, extracting BKZ, etc. 

In [None]:
import numpy as np


def _to_float_or_nan(text):
    """Return ``float(text)`` for numeric-looking strings, NaN otherwise.

    Candidates are the same as before: digits with at most one decimal point
    and any number of '-' characters. A candidate that still is not a valid
    float (e.g. "1-2" or "5-") now yields NaN instead of raising ValueError.
    """
    if not text.replace('.', '', 1).replace('-', '').isdigit():
        return float('nan')
    try:
        return float(text)
    except ValueError:
        return float('nan')


# 1. Drop the two surplus table columns by their string names.
# NOTE(review): any values camelot placed in columns '13'/'14' are discarded
# here — confirm they never carry shifted data.
final_df_2.drop(columns=['13', '14'], inplace=True)


# 2. Rename columns by position (exactly 17 columns must remain, in this order).
final_df_2.columns = [
    "Beruf",
    "Bewerber_Anzahl_Ins", "Bewerber_V_Ins",
    "Bewerber_Unversorgt_Anzahl", "Bewerber_V_Unversorgt",
    "Ausbildungsstellen_Anzahl", "Ausbildungsstellen_V",
    "Ausbildungsstellen_Unbesetzt_Anzahl", "Ausbidlungsstellen_Unbesetzt_V",
    "gemeldete_Berufsausbildungsstellen_je_Bewerber_Vorjahr", "gemeldete_Berufsausbildungsstellen_je_Bewerber_Vor_Vorjahr",
    "unbesetzte_Berufsausbildungsstellen_je_unversorgter_Bewerber_Vorjahr", "unbesetzte_Berufsausbildungsstellen_je_unversorgter_Bewerber_Vor_Vorjahr",
    "Bundesland", "Year", "Month", "source_file"
]

# 3. Replace the placeholder cells "x", "*" and "-" with NaN
final_df_2 = final_df_2.replace(["x", "*", "-"], np.nan)

# 4. Drop rows where Beruf is NaN. The .copy() makes the result an independent
#    frame, so the column assignments below do not write to a slice of the
#    original (SettingWithCopyWarning).
final_df_2 = final_df_2[final_df_2["Beruf"].notna()].copy()

# 5. Extract BKZ code (1-3 digit code at start of Beruf string);
#    expand=False returns a Series instead of a one-column DataFrame.
final_df_2["BKZ"] = final_df_2["Beruf"].astype(str).str.extract(r"^(\d{1,3})", expand=False)

# 6. Keep only rows that have a valid BKZ
final_df_2 = final_df_2[final_df_2["BKZ"].notna()].copy()


# 7. Convert columns to numeric

# List of columns to convert
columns_to_aggregate = [
    "Bewerber_Anzahl_Ins", "Bewerber_V_Ins",
    "Bewerber_Unversorgt_Anzahl", "Bewerber_V_Unversorgt",
    "Ausbildungsstellen_Anzahl", "Ausbildungsstellen_V",
    "Ausbildungsstellen_Unbesetzt_Anzahl", "Ausbidlungsstellen_Unbesetzt_V",
    "gemeldete_Berufsausbildungsstellen_je_Bewerber_Vorjahr", "gemeldete_Berufsausbildungsstellen_je_Bewerber_Vor_Vorjahr",
    "unbesetzte_Berufsausbildungsstellen_je_unversorgter_Bewerber_Vorjahr", "unbesetzte_Berufsausbildungsstellen_je_unversorgter_Bewerber_Vor_Vorjahr"
]

# German number format: '.' is the thousands separator, ',' the decimal mark.
# NOTE(review): every '.' is removed. If read_csv already parsed a column as
# float, astype(str) yields e.g. '56.0', which this turns into 560 — confirm
# that these columns still hold the original strings at this point.
for col in columns_to_aggregate:
    if col in final_df_2.columns:
        final_df_2[col] = final_df_2[col].astype(str) \
            .str.replace('.', '', regex=False) \
            .str.replace(',', '.', regex=False) \
            .str.strip() \
            .apply(_to_float_or_nan)

# 8. Reset index
final_df_2 = final_df_2.reset_index(drop=True)

# Preview cleaned data
final_df_2.head(10)
final_df_2.describe()

Now, I combine the data frame extracted from the pdf files for the two different time frames. 

In [68]:
# Stack final_df and final_df_2 row-wise (final_df first); ignore_index=True
# gives the result a fresh 0..n-1 index.

merged = pd.concat([final_df, final_df_2], ignore_index=True)  


Looking at summary statistics: 

In [None]:
# Summary statistics for every column; include="all" also covers non-numeric columns.
merged.describe(include="all")  

Look at first rows of df: 

In [None]:
# Display the first 10 rows of merged.
merged.head(10)

Renaming the columns of the data frame obtained from the Excel files so that they match the column names of the PDF data frame, then combining the two. 

In [None]:
# 1. Load the cleaned Excel-based data from the CSV exported earlier in this
#    notebook. The in-memory `combined_excel_df` is still the raw extraction
#    with col_*/dup_* columns (29 columns in the recorded output), so assigning
#    the 18 names below to it would raise a length-mismatch ValueError.
combined_excel_df = pd.read_csv(r"C:\Users\jhummels\OneDrive - DIW Berlin\Gehlen, Annica's files - retirement-labor-shortages\Data\BA_data\B_Bewerber und Berufsausbildungsstellen\arbeitsagentur_dataframes_Ausbildungen\combined_excel_df.csv")

# 2. Rename the Excel columns by position so they match the PDF column names.
# NOTE(review): the Excel names read as applicants per 100 positions, while
# the PDF names read as positions per applicant — confirm the ratio columns
# are comparable before analysing them together.
combined_excel_df.columns = [
    "Beruf",
    "Bewerber_Anzahl_Ins", "Bewerber_V_Ins",
    "Bewerber_Unversorgt_Anzahl", "Bewerber_V_Unversorgt",
    "Ausbildungsstellen_Anzahl", "Ausbildungsstellen_V",
    "Ausbildungsstellen_Unbesetzt_Anzahl", "Ausbidlungsstellen_Unbesetzt_V",
    "gemeldete_Berufsausbildungsstellen_je_Bewerber_Vorjahr", "gemeldete_Berufsausbildungsstellen_je_Bewerber_Vor_Vorjahr",
    "unbesetzte_Berufsausbildungsstellen_je_unversorgter_Bewerber_Vorjahr", "unbesetzte_Berufsausbildungsstellen_je_unversorgter_Bewerber_Vor_Vorjahr",
    "Bundesland", "Year", "Month", "source_file", "BKZ"
]

# 3. Combine the two DataFrames (PDF-based rows first, then Excel-based rows)

Ausbildungsstellen = pd.concat([merged, combined_excel_df], ignore_index=True)

Adjusting variable types:

In [None]:
# Show the dtype of every column of combined_excel_df.
combined_excel_df.dtypes

In [140]:
# Keep only rows whose BKZ, rendered as a string, consists of exactly three digits.
# NOTE(review): a float-typed BKZ would render as e.g. '111.0' and be dropped — confirm the dtype.
Ausbildungsstellen = Ausbildungsstellen[Ausbildungsstellen['BKZ'].astype(str).str.match(r'^\d{3}$')]

Exporting final data frame as csv: 

In [144]:
# Write the final data frame to CSV; index=False leaves the row index out of the file.
Ausbildungsstellen.to_csv(r"C:\Users\jhummels\OneDrive - DIW Berlin\Gehlen, Annica's files - retirement-labor-shortages\Data\Ausbildungsstellen.csv", index=False)

Summary statistics of the final data frame: 

In [None]:
# Summary statistics for every column; include="all" also covers non-numeric columns.
Ausbildungsstellen.describe(include="all")