## Goal: produce a usable CSV for visualizing how the global questions evolve over time

# Longitudinal Analysis of Global Survey Questions

This notebook creates a ready-to-use CSV that links each survey question back to its overarching “global” theme and records when it first appeared—perfect for plotting how key questions have evolved over time. We:

1. **Load inputs**  
   - Combined raw responses per commune (`commune_responses_combined_raw.csv`)  
   - Metadata on our top 10 “global” questions (`top_10_QuestionGlobales_NLP.csv`)  
   - Full list of global question labels (`QuestionGlobales_NLP.csv`)

2. **Annotate question metadata**  
   - Extract the first survey year for each global question code  
   - Explode multi-mapped codes to build a mapping dictionary from any sub-question back to its global parent

3. **Link responses to global questions**  
   - Transpose the response table so question codes become rows  
   - Derive a `year` column from each question’s code (e.g., GSB21 → 2021)  
   - Map every question in the dataset to its `quest_glob` parent code

4. **Standardize multilingual labels**  
   - Identify all “spr” columns (label translations)  
   - Replace each label with the respondent’s chosen survey language for consistency

5. **Export for visualization**  
   - Save the final table as `commune_responses_combined.csv`, ready for longitudinal plotting of global question trends


In [None]:
import pandas as pd 
import numpy as np
import re

In [None]:
# Load the three input tables from ../data.
# index_col=False is passed explicitly so that pandas does not promote the first CSV column to the index.
df_commune_responses_combined = pd.read_csv("../data/commune_responses_combined_raw.csv", index_col=False)
# Top-10 global questions; its `code_first_question` / `code_other_question` columns are used further down.
top_10_questions_globales = pd.read_csv("../data/top_10_QuestionGlobales_NLP.csv")
# Full list of global questions; not referenced again in the visible part of this notebook.
full_question_globale_NLP = pd.read_csv("../data/QuestionGlobales_NLP.csv")

In [None]:
# Keep only the columns from "GSB23_Q99" onward (positional slice starting at that column).
start_col = df_commune_responses_combined.columns.get_loc("GSB23_Q99")
df_commune_responses_combined = df_commune_responses_combined.iloc[:, start_col:]
# Preview the first remaining row.
df_commune_responses_combined.head(1)

In [None]:
# Add a `first_year` column to the global-question table, derived from the
# two-digit year embedded in `code_first_question` (pattern "GSB<yy>").
two_digit_year = (
    top_10_questions_globales["code_first_question"]
    .str.extract(r'GSB(\d{2})', expand=False)
    .astype(float)
)
# Two-digit years of 50 and above are read as 19yy, the others as 20yy.
century = two_digit_year.map(lambda yy: 1900 if yy >= 50 else 2000)
top_10_questions_globales["first_year"] = (two_digit_year + century).astype(int)

In [None]:
# Preview the first row of the global-question table.
top_10_questions_globales.head(1)

Transpose `df_commune_responses_combined` and add two columns: one holding the survey year, and one holding the unique ID of the associated global question (if there is one).

In [None]:
# Transpose so that the former column names (question codes) become the row labels.
df_commune_responses_combined = df_commune_responses_combined.transpose()

In [None]:
# Keep only the first 3197 columns of the transposed table.
# NOTE(review): 3197 is hard-coded — presumably the number of response rows to keep from the raw file; confirm and consider deriving it from the data.
df_commune_responses_combined = df_commune_responses_combined.iloc[:, :3197]
# Preview the result.
df_commune_responses_combined.head()

In [None]:
def extract_first_two_digits(id_str):
    """Return the digit token embedded in a question identifier.

    Identifiers starting with "GSB" or "spr" yield their first run of two
    consecutive digits; every other identifier yields its first run of four
    consecutive digits.

    Args:
        id_str: Identifier string to inspect.

    Returns:
        The matched digits as a string, or None when no such run exists.
    """
    # Pick the digit pattern from the identifier prefix, then search once.
    pattern = r'\d{2}' if id_str.startswith(("GSB", "spr")) else r'(\d{4})'
    found = re.search(pattern, id_str)
    if found is None:
        return None
    return found.group(0)

In [None]:
def _expand_year(token):
    """Turn a 2- or 4-digit year token into a full year, or None if unusable."""
    if pd.isna(token):
        return None
    width = len(str(token))
    if width == 4:
        return int(token)
    if width == 2:
        short_year = int(token)
        # Two-digit years from 50 upward are read as 19yy, the rest as 20yy.
        return short_year + 1900 if short_year >= 50 else short_year + 2000
    return None


# Pull the digit token out of every row label, then expand it to a full year.
year_tokens = df_commune_responses_combined.index.map(extract_first_two_digits)
df_commune_responses_combined["year"] = year_tokens
df_commune_responses_combined["year"] = df_commune_responses_combined["year"].apply(_expand_year)

# Round-trip through a -1 sentinel so known years are cast to int,
# then mark the unknown ones as missing (pd.NA).
years_as_int = df_commune_responses_combined['year'].fillna(-1).astype(int)
df_commune_responses_combined['year'] = years_as_int.replace(-1, pd.NA)

df_commune_responses_combined.head()




## Add a column with the associated global question

In [None]:
# Preview the first row of the global-question table before building the code mapping.
top_10_questions_globales.head(1)

In [None]:
# Split the "; "-separated values of `code_other_question` and explode them,
# giving one row per (global question, alternative code) pair.
split_codes = top_10_questions_globales['code_other_question'].str.split('; ')
exploded_top_10 = top_10_questions_globales.assign(code_other_question=split_codes).explode('code_other_question')

# Lookup: alternative code -> first (parent) question code.
other_question_mapping = exploded_top_10.set_index('code_other_question')['code_first_question'].to_dict()

# Every parent code also maps to itself.
self_mapping = {code: code for code in top_10_questions_globales['code_first_question']}

# Merge both lookups; on a key clash the self-mapping entry wins.
mapping_dict = dict(other_question_mapping)
mapping_dict.update(self_mapping)

# Print the final dictionary for a visual check.
print(mapping_dict)


In [None]:
# Look up each row label (question code) in mapping_dict; the result is stored in the new `quest_glob` column.
df_commune_responses_combined['quest_glob'] = df_commune_responses_combined.index.map(mapping_dict)


In [None]:
# Preview the table with the new `year` and `quest_glob` columns.
df_commune_responses_combined.head()

In [None]:
# Transpose back: question codes become columns again, and `year` / `quest_glob` become rows.
df_commune_responses_combined = df_commune_responses_combined.T

In [None]:
# Collect every column whose name begins with "spr".
spr_columns = [name for name in df_commune_responses_combined.columns if name.startswith("spr")]

# Overwrite each "spr" column with the values of "GSB23_UserLanguage" on all rows except the one labelled "year".
# NOTE(review): only the "year" row is excluded, so any other non-response row (e.g. "quest_glob") is overwritten too — confirm this is intended.
not_year_row = df_commune_responses_combined.index != 'year'
for spr_col in spr_columns:
    df_commune_responses_combined.loc[not_year_row, spr_col] = df_commune_responses_combined["GSB23_UserLanguage"]

In [None]:
# Print every column whose name contains "spr" to check the replacement.
# NOTE(review): this filter ('spr' anywhere in the name) is looser than a prefix test with startswith("spr"), so it may list columns that a prefix-based selection would skip — confirm which one is intended.
spr_columns = [col for col in df_commune_responses_combined.columns if 'spr' in col]
print(df_commune_responses_combined[spr_columns])

In [None]:
# Write the final table to ../data/commune_responses_combined.csv.
df_commune_responses_combined.to_csv('../data/commune_responses_combined.csv')