# Enriching GSB23 Survey Data with Geospatial Mapping & Multilingual Labels

In this notebook, we build on our combined 2023 survey dataset to:

1. **Map municipality responses**  
   - Load municipal boundaries from `municipalities.json`  
   - Match each respondent’s commune ID to its GeoJSON feature  
   - Embed survey answers into the GeoJSON properties  
   - Export `data/commune_responses.csv` for downstream mapping

2. **Generate human-readable translations**  
   - Clean question labels by removing leading numbers  
   - Use Facebook’s mBART model to translate German labels into English  
   - Translate the English text into French and into Italian  
   - Mark Romansh translations as “not available” for now

3. **Export the final enriched dataset**  
   - Save the multilingual question table (`df_combined`) to `data/combined_df.csv`  

With these steps, you’ll have a ready-to-use file for both geospatial visualization and multilingual reporting.  


In [None]:
import pandas as pd
import json

In [None]:
# Read the three Excel inputs: the "2023" sheet of the cleaned codebook,
# the GSB 2023 workbook and the global-questions workbook.
df_codebook_23 = pd.read_excel(
    io="data/Extraction CodeBook - 3. Cleaned.xlsx",
    sheet_name="2023",
)
df_gsb_23 = pd.read_excel(io="data/GSB 2023_V1.xlsx")
df_qg = pd.read_excel(io="data/QuestionGlobales.xlsx")

In [None]:
# Preview the first 10 rows of the global-questions table.
df_qg.head(10)

In [None]:
# Preview the first 50 rows of the 2023 codebook.
df_codebook_23.head(50)

In [None]:
# Preview the first rows of the 2023 survey responses.
df_gsb_23.head()

In [None]:
# List the column names of the survey responses.
print(df_gsb_23.columns)

In [None]:
# NOTE(review): in a notebook only the value of a cell's last expression is
# rendered, so this 4-row preview is presumably never shown — confirm, then
# drop it or display it explicitly.
df_gsb_23.head(4)
# Survey rows whose BFS_2023 value equals 5586.
df_gsb_23[df_gsb_23["BFS_2023"] == 5586]

In [None]:
# (rows, columns) of the survey responses.
df_gsb_23.shape

Data merging: select the codebook rows (questions) to keep for the combined table.

In [None]:
# Select one codebook row per survey question.
#
# "Base" rows have a code of the form GSB23_Q<n> with no word character glued
# on after the number; "first sub-question" rows have a code of the form
# GSB23_Q<n>_1. A first sub-question is kept only when its question number
# has no base row.
df_base_questions = df_codebook_23[df_codebook_23['code'].str.contains(r'GSB23_Q\d+\b')]
base_question_ids = df_base_questions['code'].str.extract(r'GSB23_Q(\d+)')[0].astype(int).unique()

df_suffix_questions = df_codebook_23[df_codebook_23['code'].str.contains(r'GSB23_Q\d+_1\b')]
# The exclusion used to build its pattern from a non-raw f-string, where "\b"
# is the backspace character instead of a regex word boundary, so no row was
# ever removed. Comparing the extracted question numbers avoids building a
# pattern per id altogether.
suffix_question_ids = df_suffix_questions['code'].str.extract(r'GSB23_Q(\d+)_1\b')[0].astype(int)
df_suffix_questions = df_suffix_questions[~suffix_question_ids.isin(base_question_ids)]

# ignore_index=True already yields a fresh 0..n-1 index.
df_first_subquestions = pd.concat([df_base_questions, df_suffix_questions], ignore_index=True)

# Show the selected rows for question Q1.
df_first_subquestions[df_first_subquestions['num_question'] == 'Q1']

In [None]:
# Selected rows whose "enquete" value is "GSB23".
df_first_subquestions[df_first_subquestions["enquete"] == "GSB23"]

In [None]:
# (rows, columns) of the selected question rows.
df_first_subquestions.shape

In [None]:
# Step 1: codes of the additional codebook entries to append
extra_questions_codes = [
    'GSB23_Teilnahme', 'GSB23_Mode', 'GSB23_StartDate', 'GSB23_EndDate',
    'GSB23_Progress', 'GSB23_Duration', 'GSB23_Finished', 'GSB23_Recorded',
    'GSB23_UserLang', 'GSB23_Q99', 'GSB23_Q100', 'GSB23_Q101'
]

# Step 2: keep the codebook rows for those codes ...
df_extra_questions = df_codebook_23[df_codebook_23['code'].isin(extra_questions_codes)]
# ... but skip any code that is already part of the selection. Codes such as
# GSB23_Q99 / GSB23_Q100 / GSB23_Q101 also match the base-question pattern, so
# appending them again would duplicate their rows in the combined table.
already_selected = df_extra_questions['code'].isin(df_first_subquestions['code'])
df_extra_questions = df_extra_questions[~already_selected]

# Step 3: combine the selected question rows with the additional rows,
# renumbering the index from 0
df_combined = pd.concat([df_first_subquestions, df_extra_questions], ignore_index=True)

# Step 4: preview the combined DataFrame
df_combined.head(25)

In [None]:
# (rows, columns) of the combined question table.
df_combined.shape

In [None]:
# Column names of the combined question table.
df_combined.columns

In [None]:
# Read the municipal boundaries and collect the id stored in each feature's
# properties.
with open('municipalities.json', encoding="utf-8") as f:
    municipalities_data = json.load(f)

list_of_commune_ids = [
    feature['properties']['id']
    for feature in municipalities_data['features']
]

# Keep only the survey rows whose GSB23_Q100 value (the commune id) is one of
# the ids found in the boundaries file.
has_known_commune = df_gsb_23['GSB23_Q100'].isin(list_of_commune_ids)
df_filtered = df_gsb_23[has_known_commune]

# Candidate columns: the original codes of the selected questions.
col_of_interest = df_combined['code_original'].tolist()
print(col_of_interest)

# Only the candidates that are actual columns of the survey data are kept.
available_columns = df_filtered.columns
col_of_interest_filtered = [
    col for col in col_of_interest if col in available_columns
]

df_commune_responses = df_filtered[col_of_interest_filtered]
df_commune_responses.head()

In [None]:
# Keep only the first occurrence of each column name; repeated columns are dropped.
df_commune_responses = df_commune_responses.loc[:, ~df_commune_responses.columns.duplicated()]

In [None]:
# List the columns that remain after de-duplication.
print(df_commune_responses.columns)


In [None]:
# Attach to every GeoJSON feature the GSB23_Q1_1 answer of the first survey
# row recorded for that commune (GSB23_Q100 holds the commune id).
#
# The lookup table is built once instead of filtering the whole DataFrame for
# every feature. tolist() yields plain Python values rather than numpy
# scalars, so the enriched GeoJSON stays serialisable with the json module.
first_rows = df_commune_responses.drop_duplicates(subset='GSB23_Q100')
first_answer_by_commune = dict(
    zip(first_rows['GSB23_Q100'].tolist(), first_rows['GSB23_Q1_1'].tolist())
)

for feature in municipalities_data['features']:
    properties = feature['properties']
    commune_id = properties['id']

    # Features without any survey row are left untouched.
    if commune_id in first_answer_by_commune:
        properties['response_Q1'] = first_answer_by_commune[commune_id]

# Export the per-commune answers for downstream mapping.
df_commune_responses.to_csv('data/commune_responses.csv', index=False, encoding='utf-8')

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
# Strip a leading "<digits>." prefix (and any whitespace after it) from each
# label; the result is the text that gets translated.
numbered_prefix = r'^\d+\.\s*'
df_combined['text_de'] = df_combined['label'].str.replace(numbered_prefix, '', regex=True)

# The model and its tokenizer are loaded from the same pretrained checkpoint.
mbart_checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(mbart_checkpoint)
tokenizer = MBart50TokenizerFast.from_pretrained(mbart_checkpoint)

# translation function with mbart
def translate_mbart(text, source_lang, target_lang):
    """Translate ``text`` from ``source_lang`` into ``target_lang``.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to translate. Anything that is not a string (for
            example ``None`` or the NaN pandas uses for a missing label) and
            blank strings are returned unchanged without calling the model.
        source_lang: Language code of ``text`` as understood by the
            tokenizer, e.g. ``"de_DE"``.
        target_lang: Language code to translate into, e.g. ``"en_XX"``.

    Returns:
        The translated string, or ``text`` itself when there is nothing to
        translate.
    """
    # Missing or blank labels would either crash the tokenizer or make the
    # model produce text out of nothing, so they are passed through as-is.
    if not isinstance(text, str) or not text.strip():
        return text

    tokenizer.src_lang = source_lang
    # encode the text to be translated (inputs longer than 512 tokens are cut)
    encoded_input = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    generated_tokens = model.generate(
        **encoded_input,
        # force the first generated token to be the target language code
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
        max_length=512,
        num_beams=4,  # use beam search for better results
        early_stopping=True
    )
    # decode the generated tokens; a single input yields a single output
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

# German -> English first; French and Italian are then both produced from the
# English text.
df_combined['text_en'] = [
    translate_mbart(german_text, "de_DE", "en_XX")
    for german_text in df_combined['text_de']
]
df_combined['text_fr'] = [
    translate_mbart(english_text, "en_XX", "fr_XX")
    for english_text in df_combined['text_en']
]
df_combined['text_it'] = [
    translate_mbart(english_text, "en_XX", "it_IT")
    for english_text in df_combined['text_en']
]

In [None]:
# Preview the combined table with the new text_* columns.
df_combined.head()

In [None]:
# No translation is produced for this column: every row gets the same fixed
# placeholder message.
df_combined['text_ro'] = 'Translation not available for the moment'

In [None]:
# Write the combined table (including the text_* columns) to CSV without the
# index column.
df_combined.to_csv('data/combined_df.csv', index=False, encoding='utf-8')