# Enquête Ménages Déplacements (EMD), Lyon / Aire métropolitaine lyonnaise - 2015
Exploratory data analysis (EDA) of the 2015 [EMD Lyon transportation survey](https://data.progedo.fr/studies/doi/10.13144/lil-1023).

Try extracting the text of `Questionnaire_EDGT_FaF_Lyon_2015.pdf`, with the **questions correctly formatted**, using both PyMuPDF (`fitz`) and `pdfplumber`.

In [74]:
import fitz  # PyMuPDF
import re

def extract_pymupdf(pdf_path: str):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path (str): path to the PDF file, opened with PyMuPDF (``fitz``)

    Returns:
        str: the per-page ``get_text()`` results joined with no separator
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

def strip_control_chars(text):
    """Remove ASCII control characters from ``text``, keeping whitespace controls.

    Code points 0x00-0x1F and 0x7F (DEL) are deleted, except tab (0x09),
    newline (0x0A) and carriage return (0x0D), which are preserved.

    Args:
        text (str): string to clean

    Returns:
        str: ``text`` without the removed characters
    """
    preserved = {0x09, 0x0A, 0x0D}  # \t, \n, \r
    removable = [code for code in (*range(0x20), 0x7F) if code not in preserved]
    # Mapping a code point to None makes str.translate delete it.
    deletion_table = {code: None for code in removable}
    return text.translate(deletion_table)

import pdfplumber

def extract_pdfplumber(pdf_path: str):
    """Return the text of every page of a PDF, extracted with pdfplumber.

    Args:
        pdf_path (str): path to the PDF file

    Returns:
        str: the per-page ``extract_text()`` results joined with no separator
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page without any text (older
        # pdfplumber releases do); treat that as an empty page rather than
        # failing on str + None.
        return "".join(page.extract_text() or "" for page in pdf.pages)


# Extract the questionnaire text with both libraries.
pdf_path = "../configs/Lyon/data/Questionnaire_EDGT_FaF_Lyon_2015.pdf"
text_pymupdf = extract_pymupdf(pdf_path)
text_pdfplumber = extract_pdfplumber(pdf_path)

# Only the PyMuPDF output is passed through the control-character filter.
clean_text = strip_control_chars(text_pymupdf)

Next, get the variable names from `Dessin_fichier_Dictionnaire_variables_EDGT_AML_Face-a-Face_02082015.xls`.

In [38]:
import pandas as pd
from typing import Tuple, List

def get_lyon_vars(file_path: str) -> Tuple[List[str], ...]:
    """Returns the encoded variable names of the Lyon survey, grouped by
    the first letter of the variable code:
            1. Menage = codes starting with "M"
            2. Personne = codes starting with "P"
            3. Deplacement = codes starting with "D"
            4. Trajet = codes starting with "T"
            5. Opinion = codes starting with "O"

    Args:
        file_path (str): path to Lyon variable dictionary .xls file

    Returns:
        tuple(List[str]): five lists of variable names, in the order above;
            sheet order and duplicate entries are kept as-is
    """
    df = pd.read_excel(file_path, sheet_name=0, header=1)
    exclude_vars = ["Variables", "MP1", "ZFM", "ECH", "DATE"]
    # Build the exclusion mask on the NaN-free series itself so the mask and
    # the data share one index (no implicit realignment).
    variables = df["Variables"].dropna()
    variables = variables[~variables.isin(exclude_vars)]

    # One pass over the sheet, bucketing each code by its leading letter.
    groups = {prefix: [] for prefix in "MPDTO"}
    for var in variables:
        # Non-string cells (e.g. numeric) have no letter prefix; skip them
        # instead of failing on a missing .startswith.
        if not isinstance(var, str) or not var:
            continue
        bucket = groups.get(var[0])
        if bucket is not None:
            bucket.append(var)

    return groups["M"], groups["P"], groups["D"], groups["T"], groups["O"]

# Path to the variable dictionary workbook read by get_lyon_vars.
file_path = "../configs/Lyon/data/Dessin_fichier_Dictionnaire_variables_EDGT_AML_Face-a-Face_02082015.xls"

# Tuple of variable-name lists, one list per leading-letter group.
lyon_variables = get_lyon_vars(file_path)

Inspecting the text from both extraction methods, the menage, trajet, and opinion variables look like they can be easily parsed from the text outputs.

I gave up on parsing them automatically; let's extract the questions manually.

In [116]:
# Print every variable name on its own line, one group after another.
for group in lyon_variables:
    for var in group:
        print(var)

M1
M2
M3
M4
M5
M6
M7A
M8A
M9A
M10A
M11A
M12A
M13A
M7B
M8B
M9B
M10B
M11B
M12B
M13B
M7C
M8C
M9C
M10C
M11C
M12C
M13C
M7D
M8D
M9D
M10D
M11D
M12D
M13D
M14
M15A
M16A
M17A
M18A
M19A
M20A
M15B
M16B
M17B
M18B
M19B
M20B
M15C
M16C
M17C
M18C
M19C
M20C
M15D
M16D
M17D
M18D
M19D
M20D
M21
M22
M23
M24-1
M24-2
M12BIS
MFIN
MODP
MOIP
PP1
PER
P1
P2
P3
P4
P5
P6
P7
P8
P9
P10
P11
P12
P14
P15
P16
P17
P18
P18A
P19
P20
P21
P22
P23
P24
P25
P26
P12A
P24A
P24B
PFIN
PER
PER
PER
DP15
DP1
D2A
D2B
D3
D4
D5A
D5B
D6
D7
D8
D9
D10
D11
D12
DOIB
DIST
DISP
DFIN
D11
TP1
T1
T2
T3
T4
T5
T6
T7
T8
T8A
T8B
T9
T10
T11
T12
T13
T8C
TOIS
TDIS
TDIP
TFIN
OP1
O1A
O1B
O1C
O1D
O1E
O1F
O1G
O2
O3A
O3B
O3C
O3D
O3E
O3F
O3G
O3H
O3I
O4
O5A
O5B
O5C
O5D
O5E
O5F
O5G
O6A
O6B
O6C
O7A
O7B
O7C
O8A
O8B
O8C
O5H
O5I
O9
O10-1
O10-2
O11
O12-1
O12-2
O13
O14-1
O14-2
O14-3
O15
O16
O17-1
O17-2
O17-3
O18-1
O18-2
O18-3
O19-1
O19-2
O19-3
O20
O21
O22-1
O22-2
O23
O24-1
O24-2
O24-3
O25-1
O25-2
O26
O2b
O27-1
O27-2
O27-3
O28-1
O28-2
O29-1
O29-2
O30
O31
O32a
O33
O34
O35-