## Cleaning Renten data

This notebook inspects the Renten data and prepares it for further analysis.

In [None]:
# Import the CSV with the Renten data

import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Display every row and column when a DataFrame is rendered.
# The canonical option keys are used; the previous 'display.max.rows' spelling
# only worked because pandas matches option names as regular expressions.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Load the raw Renten data.
# This is a raw string, so each separator is a single backslash (doubling them
# would put literal '\\' sequences into the path).
# NOTE(review): absolute, user-specific path - the notebook only runs on this machine.
df = pd.read_csv(r"C:\Users\jhummels\OneDrive - DIW Berlin\Gehlen, Annica's files - retirement-labor-shortages\Data\Renten_data\fdz_output_rentenzugaenge_berufsgruppen.csv")

Preview data

In [None]:
# Show the first 20 rows of the raw data for a quick visual check.
df.head(20)

In [None]:
# Dimensions of the raw data as (number of rows, number of columns).
df.shape

In [None]:
# Column labels of the raw data.
df.columns

In [None]:
# Number of missing values in each column of the raw data.
df.isna().sum()

In [None]:
# Inspect the 'kldb2010_3_akvs' column: which distinct values occur, and how many.
kldb_column = df["kldb2010_3_akvs"]

# Distinct values in order of first appearance.
unique_values = kldb_column.unique()
# Number of distinct values.
unique_count = kldb_column.nunique()

print(unique_values)
print(unique_count)

In [None]:
# Display the distinct values of the 'year' column to see which years the data covers
unique_values_Year = df["year"].unique()
print(unique_values_Year)

If data for a Berufsgruppe in a specific Bundesland and year are missing, the raw data frame omits that row entirely instead of including it with NA values. The following code ensures that every combination of Bundesland, Berufsgruppe and year is present: combinations that are absent from the data set are added back as rows with NA values.

In [None]:
# Distinct key values observed in the raw data
unique_bundeslaender = df["bland"].unique()
unique_berufsgruppen = df["kldb2010_3_akvs"].unique()
unique_years = df["year"].unique()

# Full grid with one row per (bland, kldb2010_3_akvs, year) combination.
# MultiIndex.from_product builds the Cartesian product inside pandas and keeps
# the dtypes of the key columns, so the merge keys line up with those in df.
all_combinations = pd.MultiIndex.from_product(
    [unique_bundeslaender, unique_berufsgruppen, unique_years],
    names=["bland", "kldb2010_3_akvs", "year"],
).to_frame(index=False)

# Left-merge the raw data onto the grid: combinations that do not occur in df
# are kept as rows whose remaining columns are NaN.
complete_df = all_combinations.merge(df, on=["bland", "kldb2010_3_akvs", "year"], how="left")

# Persist the completed data set for later analysis steps.
# NOTE(review): absolute, user-specific path - the notebook only runs on this machine.
complete_df.to_csv(r"C:\Users\jhummels\OneDrive - DIW Berlin\Gehlen, Annica's files - retirement-labor-shortages\Data\Renten_data\complete_df_Rente.csv", index=False)

# Display the first few rows of the complete DataFrame
complete_df.head()


In [None]:
# Number of missing values in each column after completing the grid.
complete_df.isna().sum()

Plotting missingness of alter_rentenzugang variable by Berufsgruppe

In [None]:
import matplotlib.pyplot as plt

# Number of missing 'alter_rentenzugang' values per 'kldb2010_3_akvs' group,
# largest count first.
missing_alter_rentenzugang = (
    complete_df['alter_rentenzugang']
    .isna()
    .groupby(complete_df['kldb2010_3_akvs'])
    .sum()
    .sort_values(ascending=False)
)

# Very tall figure so that every group gets its own readable bar.
fig, ax = plt.subplots(figsize=(10, 100))
missing_alter_rentenzugang.plot(kind='barh', ax=ax)

ax.set_title('Number of Missing "Alter_Rentenzugang" Values by Beruf')
ax.set_xlabel('Count of Missing Values')
ax.set_ylabel('Beruf')
ax.invert_yaxis()  # barh draws the first entry at the bottom; flip so the highest count is on top
ax.grid(axis='x', linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()


In [None]:
# Share of missing values (between 0 and 1) in every column, per Bundesland.
# The NaN mask is computed once and then averaged within each group; this avoids
# DataFrameGroupBy.apply over the grouping column, which is deprecated since pandas 2.2.
complete_df.isna().groupby(complete_df['bland']).mean()

In [None]:
# Share of missing values (between 0 and 1) in every column, per year.
# The NaN mask is computed once and then averaged within each group; this avoids
# DataFrameGroupBy.apply over the grouping column, which is deprecated since pandas 2.2.
complete_df.isna().groupby(complete_df['year']).mean()

In [None]:
# Step 1: Share of missing 'alter_rentenzugang' values per year
year_missing = (
    complete_df['alter_rentenzugang']
    .isna()
    .groupby(complete_df['year'])
    .mean()
    .reset_index(name='missing_rate')
)

# Step 2: Share of missing 'alter_rentenzugang' values per Bundesland, highest first
bundesland_missing = (
    complete_df['alter_rentenzugang']
    .isna()
    .groupby(complete_df['bland'])
    .mean()
    .sort_values(ascending=False)
)

# The (up to) 12 years with the highest missing rate; reused for printing and plotting.
top_years = year_missing.sort_values('missing_rate', ascending=False).head(12)
top_months = top_years  # backward-compatible alias: the frame holds years, not months

# Step 3: Print summary
print("\nYears with highest missing rates:")
print(top_years)

print("\nBundesländer with highest missing rates:")
print(bundesland_missing.head(10))

# Step 4: Visualize the years with the highest missing rates
fig_years, ax_years = plt.subplots(figsize=(10, 6))
# Years are converted to strings so matplotlib treats them as categories and keeps
# the ranking order; numeric years would be placed at their numeric position on the
# axis and the sort by missing rate would be lost.
# NOTE(review): assumes 'year' is stored as an integer or string - confirm.
ax_years.barh(top_years['year'].astype(str), top_years['missing_rate'], color='coral')
ax_years.set_xlabel('Missing Rate')
ax_years.set_ylabel('Year')
ax_years.set_title('Top Years with Highest Missing Rates in "alter_rentenzugang"')
ax_years.invert_yaxis()  # highest missing rate on top
ax_years.grid(axis='x', linestyle='--', alpha=0.6)
fig_years.tight_layout()
plt.show()

# Step 5: Visualize the (up to) 16 Bundesländer with the highest missing rates
top_bundeslaender = bundesland_missing.head(16)

fig_bland, ax_bland = plt.subplots(figsize=(8, 6))
top_bundeslaender.plot(kind='barh', color='steelblue', ax=ax_bland)
ax_bland.set_xlabel('Missing Rate')
ax_bland.set_ylabel('Bundesland')
ax_bland.set_title('Top 16 Bundesländer with Highest Missing Rates in "alter_rentenzugang"')
ax_bland.invert_yaxis()  # highest missing rate on top
ax_bland.grid(axis='x', linestyle='--', alpha=0.6)
fig_bland.tight_layout()
plt.show()

