# Enquête Ménages Déplacements (EMD), Lyon / Aire métropolitaine lyonnaise - 2015
Exploratory data analysis (EDA) of the 2015 [EMD Lyon transportation survey](https://data.progedo.fr/studies/doi/10.13144/lil-1023).

Try extracting the **correctly formatted questions** from `Questionnaire_EDGT_FaF_Lyon_2015.pdf` using `pdfplumber`.

Next, get variable names from `Dessin_fichier_Dictionnaire_variables_EDGT_AML_Face-a-Face_02082015.xls`

In [1]:
import pandas as pd
from typing import Tuple, List

# Root folder of the Lyon survey configuration.
config_folder = "../configs/Lyon"
# Variable dictionary of the survey, stored in the "data" sub-folder of the
# configuration folder.
file_path = (
    config_folder
    + "/data/"
    + "Dessin_fichier_Dictionnaire_variables_EDGT_AML_Face-a-Face_02082015.xls"
)

def get_lyon_vars(
    file_path: str,
) -> Tuple[List[str], List[str], List[str], List[str], List[str]]:
    """Return the encoded variable names of the Lyon survey, split by survey part.

    The first sheet of the variable dictionary is read (second row used as
    header) and the names in its ``Variables`` column are grouped by their
    leading letter:

        1. Menage (household)      -> names starting with "M"
        2. Personne (person)       -> names starting with "P"
        3. Deplacement (trip)      -> names starting with "D"
        4. Trajet (trip leg)       -> names starting with "T"
        5. Opinion (opinion)       -> names starting with "O"

    Empty cells, non-text cells and the names listed in ``exclude_vars`` are
    skipped; names with any other leading letter are ignored.

    Args:
        file_path (str): path to Lyon variable dictionary .xls file

    Returns:
        tuple: five lists of variable names, in the order
            (menage, personne, deplacement, trajet, opinion); each list keeps
            the order in which the names appear in the sheet.
    """
    df = pd.read_excel(file_path, sheet_name=0, header=1)
    exclude_vars = ["Variables", "MP1", "ZFM", "ECH", "DATE"]

    # Drop empty cells first, then build the exclusion mask on the *same*
    # series so the mask and the data share one index.
    variables = df["Variables"].dropna()
    variables = variables[~variables.isin(exclude_vars)]

    # One pass over the names, bucketed by first letter.
    prefixes = ("M", "P", "D", "T", "O")
    vars_by_prefix = {prefix: [] for prefix in prefixes}
    for var in variables:
        # A non-text cell cannot be a variable name; skip it instead of
        # failing on a missing string method.
        if isinstance(var, str) and var[:1] in vars_by_prefix:
            vars_by_prefix[var[:1]].append(var)

    menage_vars = vars_by_prefix["M"]         # MX
    personne_vars = vars_by_prefix["P"]       # PX
    deplacement_vars = vars_by_prefix["D"]    # DX
    trajet_vars = vars_by_prefix["T"]         # TX
    opinion_vars = vars_by_prefix["O"]        # OX

    return menage_vars, personne_vars, deplacement_vars, trajet_vars, opinion_vars


# Tuple of five lists of variable names (menage, personne, deplacement,
# trajet, opinion) read from the variable dictionary at `file_path`.
lyon_variables = get_lyon_vars(file_path)

def get_fiche_responses(excel_path: str) -> Tuple[List[List[str]], List[pd.DataFrame]]:
    """Read variable groups and their response-option tables from the dictionary.

    The second sheet of the workbook is read. Its first column holds variable
    names (one name, or several comma-separated names, per cell); the
    remaining columns hold the response options.

    Args:
        excel_path (str): path to the variable dictionary Excel file.

    Returns:
        tuple: ``(formatted_var_groups, chunks)`` where

            * ``formatted_var_groups`` is a list of lists of variable names,
              one inner list per run of consecutive non-empty cells in the
              first column, with comma-separated cells split and every name
              stripped of surrounding whitespace;
            * ``chunks`` is a list of DataFrames, one per run of consecutive
              rows whose second response column is non-empty.

        The two lists are expected to be parallel (same length, same order)
        when the sheet is well formed; this is not verified here.
    """
    df = pd.read_excel(excel_path, sheet_name=1)

    # First column: variable names. Rows whose cell mentions "FILTRE" are
    # removed before grouping.
    var_series = df.iloc[:, 0]
    var_series = var_series[~var_series.str.contains("FILTRE", na=False)]

    # Group on 'islands' of vars: the group id increases every time the
    # empty/non-empty state flips, so each run of non-empty cells is a group.
    mask = var_series.notna()
    groups = (mask != mask.shift()).cumsum()
    var_groups = [
        group.to_list()
        for _, group in var_series.groupby(groups)
        if group.notna().any()
    ]

    # Flatten every group into one list of names. Each cell is split on
    # commas and each name stripped, whether the group has one cell or many.
    formatted_var_groups = [
        [var.strip() for cell in group for var in cell.split(",")]
        for group in var_groups
    ]

    # Response options: everything but the first column.
    response_options_df = df.iloc[:, 1:]

    # Rows belong to a chunk while the second response column is non-empty;
    # a new chunk starts after each gap.
    mask = response_options_df.iloc[:, 1].notna()
    group_id = (mask != mask.shift()).cumsum()
    filtered_groups = response_options_df[mask].groupby(group_id)
    chunks = [group for _, group in filtered_groups]

    return formatted_var_groups, chunks

def df_to_dict(df: pd.DataFrame) -> dict:
    """Convert a response-option table into a ``{code: label}`` dictionary.

    Rows containing any missing value are dropped. Codes are taken from the
    ``"FICHE MENAGE"`` column and labels from the ``"Unnamed: 2"`` column.
    When every code can be cast to an integer the keys are the integer
    rendered as text (so ``1.0`` becomes ``"1"``); otherwise all codes are
    rendered as text unchanged.

    Args:
        df (pd.DataFrame): table with ``"FICHE MENAGE"`` and ``"Unnamed: 2"``
            columns.

    Returns:
        dict: mapping of code (str) to label.

    Raises:
        KeyError: if one of the two expected columns is missing.
    """
    df = df.dropna()
    codes = df["FICHE MENAGE"]
    try:
        keys = codes.astype(int).astype(str)
    except (ValueError, TypeError, OverflowError):
        # Non-numeric codes (e.g. letters): keep them as plain text. Only
        # conversion failures are handled so unrelated errors still surface.
        keys = codes.astype(str)
    return dict(zip(keys, df["Unnamed: 2"]))

## EMD Question dictionary

In [4]:
import re
from pathlib import Path

def process_EnqueteMenagesDeplacements(config_folder: str) -> dict:
    """Build the EMD question dictionary from the questions file and the data dictionary.

    Reads ``questions.csv`` (two header-less columns: variable name, question
    text) from ``config_folder`` and the variable dictionary from its ``data``
    sub-folder, then pairs every variable that has a question with the
    response options of the group it belongs to.

    Args:
        config_folder (str): folder containing ``questions.csv`` and the
            ``data`` sub-folder with the variable dictionary.

    Returns:
        dict: ``{variable: {"question": ..., "dtype": "TEXT", "response": {...}}}``.
            Variables without a matching question are left out. When several
            raw names normalize to the same variable, the last group seen
            provides the response options.
    """
    data_path = Path(config_folder) / "data"
    data_dictionary_path = data_path / "Dessin_fichier_Dictionnaire_variables_EDGT_AML_Face-a-Face_02082015.xls"
    questions_path = Path(config_folder) / "questions.csv"

    # questions df
    questions_df = pd.read_csv(questions_path, header=None, names=["var", "question"])

    # One lookup table instead of scanning the DataFrame for every variable.
    # setdefault keeps the first question when a variable is listed twice.
    question_by_var = {}
    for name, question in zip(questions_df["var"], questions_df["question"]):
        question_by_var.setdefault(name, question)

    # variables and responses from data dictionary
    var_groups, chunks = get_fiche_responses(data_dictionary_path)
    formatted_responses = [df_to_dict(chunk) for chunk in chunks]

    # Names that must be looked up as-is, without suffix stripping.
    re_ignore = {"JOURDEP", "M12A"}
    # Strips a trailing upper-case letter or a trailing "-<digits>" suffix
    # after a letters+digits stem, e.g. "P12A" -> "P12", "D5-1" -> "D5".
    suffix_pattern = re.compile(r'([A-Z]+\d+)(?:[A-Z]$|-\d+$)')

    query_dictionary = {}
    for group_index, var_group in enumerate(var_groups):
        for var in var_group:
            if var not in re_ignore:
                var = suffix_pattern.sub(r'\1', var)
            if var in question_by_var:
                query_dictionary[var] = {
                    "question": question_by_var[var],
                    "dtype": "TEXT",
                    # response options are indexed by the position of the
                    # variable group in the dictionary sheet
                    "response": formatted_responses[group_index]
                }

    return query_dictionary

# Build the question dictionary for the Lyon configuration folder.
query_dictionary = process_EnqueteMenagesDeplacements(config_folder=config_folder)

# Population synthesis

Extract the rows of the original census file `FD_INDCVI_2021.csv` ([Individus localisés au canton-ou-ville en 2021](https://www.insee.fr/fr/statistiques/8268848)) that are referenced by the synthesized Lyon population `lyon_persons.csv`.

In [5]:
# get index from population synthesis
data_path = Path(config_folder) / "data"
index_path = data_path / "lyon_persons.csv"
census_path = data_path / "FD_INDCVI_2021.csv"

# TRACKER values are matched against the row index of the census file below.
index_df = pd.read_csv(index_path, sep=";")
tracked_records = index_df.TRACKER.values

records = []

# Stream the census in chunks and keep only the tracked rows of each chunk.
# The reader gets its own name (it is not a DataFrame) and is closed by the
# `with` block even if filtering fails midway.
# NOTE(review): relies on the default integer row index continuing across
# chunks, so labels are row positions in the full file - confirm.
with pd.read_csv(
    census_path,
    chunksize=10240,
    index_col=False,
    sep=";",
    low_memory=False,
    dtype=str
    ) as census_reader:
    for chunk in census_reader:
        filtered_chunk = chunk[chunk.index.isin(tracked_records)]
        if not filtered_chunk.empty:
            records.append(filtered_chunk)

if not records:
    # pd.concat would raise a ValueError with an unhelpful message here.
    raise ValueError(
        f"No row of {census_path} matches the TRACKER values of {index_path}"
    )
census_df = pd.concat(records)


In [6]:
# Save the filtered census rows next to the other data files, using ";" as
# separator. The DataFrame index is written as the first column (to_csv
# default).
write_path = data_path / "lyon_FD_INDCVI_2021.csv"
census_df.to_csv(write_path, sep=";")

Check spatial quality

In [None]:
import geopandas as gpd

spatial_path = data_path / "CONTOURS-IRIS_3-0__SHP__FRA_2023-01-01"

# Every shapefile found anywhere below the contours folder.
spatial_files = list(spatial_path.rglob("*.shp"))

# Read each shapefile and reproject it to EPSG:3857 so all layers share one
# CRS. The loop variable is deliberately not named `file_path`: that name is
# the module-level path to the variable dictionary and must not be overwritten.
gdfs = []
for shp_path in spatial_files:
    gdf = gpd.read_file(shp_path).to_crs(3857)
    gdfs.append(gdf)

concatted = pd.concat(gdfs)

# IRIS codes present in the filtered census rows.
lyon_iris = census_df.IRIS.values

# Keep only the contours whose CODE_IRIS appears in the census subset.
filtered = concatted[concatted.CODE_IRIS.isin(lyon_iris)]

In [None]:
# Load the shapefile at the path below and reproject it to EPSG:3857, the
# same CRS the IRIS contours were converted to.
# NOTE(review): presumably the IRIS territory layer published by the
# Metropole de Lyon, judging by the folder name - confirm.
lyon_path = data_path / "metropole-de-lyon_ter_territoire.teriris_latest/ter_territoire_teriris_latest.shp"
lyon_gdf = gpd.read_file(lyon_path).to_crs(3857)

In [None]:
import matplotlib.pyplot as plt
# Draw both layers on the same axes for a visual comparison.
fig, ax = plt.subplots()
filtered.plot(ax=ax)
# zorder=-1 places the red layer underneath the filtered contours.
lyon_gdf.plot(color="red", ax=ax, zorder=-1)