# Enquête Ménages Déplacements (EMD), Lyon / Aire métropolitaine lyonnaise - 2015
Exploratory data analysis (EDA) of the 2015 [EMD Lyon transportation survey](https://data.progedo.fr/studies/doi/10.13144/lil-1023).

Try reading `Questionnaire_EDGT_FaF_Lyon_2015.pdf` with the **correctly formatted questions** using `pdfplumber`.

Next, get variable names from `Dessin_fichier_Dictionnaire_variables_EDGT_AML_Face-a-Face_02082015.xls`

In [3]:
import pandas as pd
from typing import Tuple, List

def get_lyon_vars(file_path: str) -> Tuple[List[str], ...]:
    """Return the encoded variable names of the Lyon survey, split by survey part.

    Names are read from the ``Variables`` column of the first sheet (the
    header is taken from the second row) and assigned to a part by their
    first letter:
            1. Menage (prefix "M") = household
            2. Personne (prefix "P") = person
            3. Deplacement (prefix "D") = trip
            4. Trajet (prefix "T") = trip leg (presumably; confirm against
               the survey documentation)
            5. Opinion (prefix "O") = opinion

    Args:
        file_path (str): path to Lyon variable dictionary .xls file

    Returns:
        tuple(List[str]): five lists of variable names, in the order
            (menage, personne, deplacement, trajet, opinion)
    """
    df = pd.read_excel(file_path, sheet_name=0, header=1)

    # Entries that are never reported as survey variables. "Variables" is the
    # header text itself; it is excluded in case it reappears inside the column.
    exclude_vars = ["Variables", "MP1", "ZFM", "ECH", "DATE"]

    # Build the exclusion mask on the already-cleaned series so that the mask
    # and the data share the same index (no implicit re-alignment).
    names = df["Variables"].dropna()
    names = names[~names.isin(exclude_vars)]

    # Only text cells can be variable names; a stray numeric cell would
    # otherwise fail on ``.startswith``.
    variables = [var for var in names if isinstance(var, str)]

    prefixes = ("M", "P", "D", "T", "O")  # MX, PX, DX, TX, OX
    return tuple(
        [var for var in variables if var.startswith(prefix)]
        for prefix in prefixes
    )

# Relative path to the EDGT variable dictionary workbook.
file_path = "../configs/Lyon/data/Dessin_fichier_Dictionnaire_variables_EDGT_AML_Face-a-Face_02082015.xls"

# Tuple of variable-name lists: (menage, personne, deplacement, trajet, opinion).
lyon_variables = get_lyon_vars(file_path)

In [None]:
import ast
from typing import List

def get_fiche_responses(excel_path: str) -> Tuple[List[List[str]], List[pd.DataFrame]]:
    """Extract grouped variable names and their response-option tables.

    Reads the second sheet of the workbook. The first column holds variable
    names (one name or a comma-separated list per cell); the remaining
    columns hold the response options.

    Variable names: rows whose first cell contains "FILTRE" are dropped, then
    every run of consecutive non-empty cells becomes one group. Each cell in
    a group is split on commas and every resulting name is stripped of
    surrounding whitespace.

    Response options: every run of consecutive rows with a value in the
    second of the remaining columns becomes one chunk.

    Args:
        excel_path (str): path to the Lyon variable dictionary workbook

    Returns:
        tuple: ``(formatted_var_groups, chunks)`` where
            ``formatted_var_groups`` is a list of lists of variable names and
            ``chunks`` is a list of DataFrames of response options. If the
            sheet is laid out as expected both lists have the same length;
            this is not checked here.
    """
    df = pd.read_excel(excel_path, sheet_name=1)

    # --- variable names (first column) ---------------------------------
    var_series = df.iloc[:, 0]
    # Element-wise check instead of ``.str.contains`` so that an all-empty or
    # numeric column does not raise; non-text cells never count as a filter.
    is_filter = var_series.map(
        lambda cell: isinstance(cell, str) and "FILTRE" in cell
    ).astype(bool)
    var_series = var_series[~is_filter]

    # Label 'islands': the id increases every time the filled/empty state
    # flips, so consecutive rows in the same state share one id.
    has_name = var_series.notna()
    run_id = (has_name != has_name.shift()).cumsum()

    formatted_var_groups = []
    for _, run in var_series.groupby(run_id):
        if not run.notna().any():
            continue  # run of empty cells separating two groups
        # One uniform rule for every group: split each cell on commas and
        # strip each name. ``str()`` guards against non-text cells.
        names = [
            name.strip()
            for cell in run
            for name in str(cell).split(",")
        ]
        formatted_var_groups.append(names)

    # --- response options (remaining columns) --------------------------
    response_options_df = df.iloc[:, 1:]

    # NOTE(review): the mask uses the SECOND of the remaining columns (third
    # column of the sheet) - confirm this is the column that marks option rows.
    has_option = response_options_df.iloc[:, 1].notna()
    block_id = (has_option != has_option.shift()).cumsum()
    option_rows = response_options_df[has_option]
    chunks = [
        block for _, block in option_rows.groupby(block_id[has_option])
    ]

    return formatted_var_groups, chunks

# Parse the second sheet of the workbook at `file_path` into variable-name
# groups and their response-option chunks.
formatted_var_groups, chunks = get_fiche_responses(file_path)