In [None]:
import pandas as pd
from thefuzz import fuzz

# Columns of the municipality table that are not vote results; the matching
# loop skips these and treats every other column as one vote.
default_columns_by_municipality = [
    "Gemeinde",
    "Kanton",
    "Bezirk",
    "Sprachregion",
]

# Metadata fields of interest: the two fields used to match a vote
# (title and date) followed by the descriptive attributes of a vote.
columns_metadata = ["Title (de)", "Date"] + [
    "Institution",
    "Theme 1",
    "Theme 2",
    "Theme 3",
    "Vote trigger",
    "Vote Result status",
    "Official status",
    "Legal act type",
    "Vote trigger actor",
]

# Input tables, read relative to the current working directory.
df_metadata = pd.read_csv("cleaned_metadata_per_votum.csv")
df_municipality = pd.read_csv("cleaned_data_by_municipality.csv")

# Accumulators filled by the matching loop:
#   valid_columns - every vote column that was examined
#   metadata      - vote column name -> the single matching metadata row
valid_columns, metadata = [], {}

# Number of vote columns with exactly one / more than one / no metadata match.
counter = counter2 = counter3 = 0

# Match every vote column of the municipality table to a metadata row.
#
# A vote column name is split into its first 10 characters (compared against
# the metadata "Date" column, presumably an ISO date - confirm against the
# CSV) and the text from position 11 onwards (the vote title); the character
# in between is skipped as a separator.
#
# Only metadata rows with the same date are scored. The boolean mask is built
# on the index of `metadata_by_date` itself, so pandas does not have to
# reindex a mask coming from the full `df_metadata` (which raised a
# "Boolean Series key will be reindexed" UserWarning), and `df_metadata` is
# no longer modified with a scratch "similarity_score" column.
for column in df_municipality.columns:
    if column in default_columns_by_municipality:
        continue

    abstimmung_datum = column[:10]
    abstimmung_name = column[11:]
    metadata_by_date = df_metadata[df_metadata["Date"] == abstimmung_datum]

    # A title matches when its fuzzy partial-ratio score against the vote
    # name is strictly greater than 75 (scores range from 0 to 100).
    is_similar = pd.Series(
        [
            fuzz.partial_ratio(abstimmung_name, title) > 75
            for title in metadata_by_date["Title (de)"]
        ],
        index=metadata_by_date.index,
        dtype=bool,
    )
    metadata_by_name = metadata_by_date[is_similar]

    # Every vote column is kept, whether or not metadata was found for it.
    valid_columns.append(column)

    # Metadata is only stored for unambiguous matches; columns with several
    # or no candidates are merely counted.
    match_count = metadata_by_name.shape[0]
    if match_count == 1:
        metadata[column] = metadata_by_name.iloc[0]
        counter += 1
    elif match_count > 1:
        counter2 += 1
    else:
        counter3 += 1
print("Equal to 1 ", counter)
print("More than 1 ", counter2)
print("Equal to 0 ", counter3)

# Reorder the municipality table: default columns first, then the vote columns
# collected by the matching loop.
columns_to_keep = default_columns_by_municipality + valid_columns
df_municipality_filtered = df_municipality[columns_to_keep]

# Object dtype keeps every cell value as-is when it is stacked underneath the
# string header rows built for each metadata type.
municipality_rows = df_municipality_filtered.astype(object)

# One output DataFrame per metadata type, keyed by the metadata type name.
metadata_dataframes = {}

for metadata_type in [
    "Institution",
    "Theme 1",
    "Theme 2",
    "Theme 3",
    "Vote trigger",
    "Vote Result status",
    "Official status",
    "Legal act type",
    "Vote trigger actor",
]:
    # First row: the value of this metadata type for each vote column.
    # Default columns, votes without a unique metadata match and missing
    # (NaN) metadata values are all written as "unknown".
    header_row = []
    for col in df_municipality_filtered.columns:
        value = "unknown"
        if col not in default_columns_by_municipality and col in metadata:
            value = metadata[col].get(metadata_type, "unknown")
            if pd.isna(value):
                value = "unknown"
        header_row.append(value)

    # Header block = the metadata row followed by 4 empty rows.
    header_df = pd.DataFrame(
        [header_row] + [[None] * len(header_row)] * 4,
        columns=df_municipality_filtered.columns,
        dtype=object,
    )

    # Stack the municipality data below the header block in one step instead
    # of appending it row by row, which re-allocates the frame on every row.
    metadata_df = pd.concat([header_df, municipality_rows], ignore_index=True)

    # Save the metadata DataFrame
    metadata_dataframes[metadata_type] = metadata_df

    # Write the DataFrame to disk; the ./final_data directory must exist.
    print(metadata_type)
    metadata_df.to_csv(f"./final_data/voting_{metadata_type}.csv", index=False)

# Metadata DataFrames are now stored in `metadata_dataframes` dictionary

# Metadata DataFrames are now stored in `metadata_dataframes` dictionary


  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_by_name = metadata_by_date[df_metadata["similarity_score"] > 75]
  metadata_b

Equal to 1  426
More than 1  28
Equal to 0  38
Institution
Theme 1
Theme 2
Theme 3
Vote trigger
Vote Result status
Official status
Legal act type
Vote trigger actor
