# Carte des arrondissements

Test du choropleth plotly avec la géographie des arrondissements de Montréal

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import json

MAP_FPATH = '../assets/maps/arrondissements_montreal.geojson'

# GeoJSON is UTF-8 by specification; state it explicitly so the accented
# borough names do not depend on the platform's default encoding.
with open(MAP_FPATH, encoding='utf-8') as f:
    arrond = json.load(f)

# Quick look at the structure of the loaded GeoJSON.
print(arrond.keys())
print(arrond['type'])
print(arrond['crs'])
print(arrond['features'])

# Property used both as the feature identifier and as the colour value.
# (Named `id_key` so the builtin `id` is not shadowed for the rest of the session.)
id_key = 'no_qr'
features = arrond['features']
fig = go.Figure(
    go.Choroplethmap(
        geojson=arrond,
        featureidkey='properties.' + id_key,
        locations=[feat['properties'][id_key] for feat in features],
        z=[feat['properties'][id_key] for feat in features],
        hovertext=[feat['properties']['nom_qr'] for feat in features],
        colorscale="Viridis",
        marker_opacity=0.5, marker_line_width=0))
fig.update_layout(map=dict(center=dict(lat=45.5517, lon=-73.7073), style="carto-positron", zoom=8.7))
fig.show()

# Districts électoraux de Montréal
Il manque tout ce qui ne fait pas techniquement partie de la ville de Montréal, p. ex. Westmount

In [None]:
import plotly.graph_objects as go
import json

MAP_FPATH = '../assets/maps/districts_montreal.geojson'

# GeoJSON is UTF-8 by specification; state it explicitly so the accented
# district names do not depend on the platform's default encoding.
with open(MAP_FPATH, encoding='utf-8') as f:
    districts = json.load(f)

# Quick look at the structure of the loaded GeoJSON.
print(districts.keys())
print(districts['type'])
print(districts['name'])
print(len(districts['features']))
print(districts['features'][0]['geometry'])
print(districts['features'][0].keys())
print(districts['features'][0]['properties'].keys())

# Property used both as the feature identifier and as the colour value.
# (Named `id_key` so the builtin `id` is not shadowed for the rest of the session.)
id_key = 'id'
features = districts['features']
fig = go.Figure(go.Choroplethmap(
    geojson=districts,
    featureidkey='properties.' + id_key,
    locations=[feat['properties'][id_key] for feat in features],
    z=[feat['properties'][id_key] for feat in features],
    hovertext=[feat['properties']['nom'] for feat in features],
    marker_opacity=0.5, marker_line_width=0))
fig.update_layout(
    map=dict(
        center=dict(lat=45.5517, lon=-73.7073), zoom=8.7))
fig.show()

# Superposition des arrondissements et circos

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import json

ARROND_FPATH = '../assets/maps/arrondissements_montreal.geojson'
CIRCO_FPATH = '../assets/maps/districts_montreal.geojson'


# GeoJSON is UTF-8 by specification; state it explicitly so the accented
# names do not depend on the platform's default encoding.
with open(ARROND_FPATH, encoding='utf-8') as f:
    arrond = json.load(f)
with open(CIRCO_FPATH, encoding='utf-8') as f:
    districts = json.load(f)

# Boroughs, outlined in blue. A constant z gives every polygon the same fill.
# (One dedicated name per key, so the builtin `id` is never shadowed.)
arrond_key = 'no_qr'
arrond_features = arrond['features']
fig = go.Figure(
    go.Choroplethmap(
        geojson=arrond,
        featureidkey='properties.' + arrond_key,
        locations=[feat['properties'][arrond_key] for feat in arrond_features],
        z=[0] * len(arrond_features),
        hovertext=[feat['properties']['nom_qr'] for feat in arrond_features],
        marker_opacity=0.5, marker_line_width=5, marker_line_color='blue'))

# Electoral districts, outlined in red, drawn on top of the boroughs.
district_key = 'id'
district_features = districts['features']
fig = fig.add_trace(go.Choroplethmap(
    geojson=districts,
    featureidkey='properties.' + district_key,
    locations=[feat['properties'][district_key] for feat in district_features],
    z=[1] * len(district_features),
    hovertext=[feat['properties']['nom'] for feat in district_features],
    colorscale="Viridis",
    marker_opacity=0.5, marker_line_width=5, marker_line_color='red'))
fig.update_layout(
    showlegend=True,
    map=dict(
        center=dict(lat=45.5517, lon=-73.7073), zoom=8.7))
fig.show()

# Test des 125 districts électoraux du Québec

In [None]:
import plotly.graph_objects as go
import numpy as np
import json

MAP_FPATH = '../assets/maps/districts_QC.geojson'

# GeoJSON is UTF-8 by specification; state it explicitly so the accented
# district names do not depend on the platform's default encoding.
with open(MAP_FPATH, encoding='utf-8') as f:
    districts = json.load(f)

# Tag every feature with its position in the file so that it can serve as the
# feature identifier of the map below.
for i, feature in enumerate(districts['features']):
    feature['properties']['ID'] = i

# Quick look at the structure of the loaded GeoJSON.
print(districts.keys())
print(districts['type'])
print(districts['name'])
print(districts['crs'])
print(len(districts['features']))
print(districts['features'][0].keys())
for key, value in districts['features'][0]['properties'].items():
    print('|\t', key, ':', value)

# Property used both as the feature identifier and as the colour value.
# (Named `id_key` so the builtin `id` is not shadowed for the rest of the session.)
id_key = 'ID'
features = districts['features']
fig = go.Figure(go.Choroplethmap(
    geojson=districts,
    featureidkey='properties.' + id_key,
    locations=[feat['properties'][id_key] for feat in features],
    z=[feat['properties'][id_key] for feat in features],
    hovertext=[feat['properties']['NM_CEP'] for feat in features],
    marker_opacity=0.5, marker_line_width=1))

# NOTE(review): update_geos configures `geo` subplots, while this trace is
# drawn on a `map` subplot -- this projection presumably has no visible
# effect; confirm before relying on it or removing it.
fig.update_geos(
    projection=dict(
        type="conic conformal",
        parallels=[50, 46],
    ))
fig.update_layout(
    map=dict(center=dict(lat=54, lon=-68.5), zoom=3.7),
    width=600, height=800)

fig.show()

## Test du dataset de langues, etc.

Des données de langue sur l'ensemble du Québec (enfin, les villes principales), juste pour voir comment c'était organisé.

In [None]:
import pandas as pd

# Load the semicolon-separated language dataset with pandas.
FPATH = '../assets/data/langues.csv'
df = pd.read_csv(FPATH, delimiter=';')

In [None]:
# polars is quite a bit faster
import polars as pl

# Load the semicolon-separated language dataset with polars.
FPATH = '../assets/data/langues.csv'
df = pl.read_csv(FPATH, separator=';')

# Drop the empty columns: the bare "Symbole" one plus its eight
# "Symbole_duplicated_<i>" variants.
symbol_columns = ["Symbole"]
symbol_columns.extend(f"Symbole_duplicated_{i}" for i in range(8))
df = df.drop(symbol_columns)


In [None]:
# List every column name, one per line.
print(*df.columns, sep='\n')

In [None]:
# Preview the first rows of the dataframe.
print(df.head())

In [None]:
# List the distinct values of the GEO column, one per line.
print(*df['GEO'].unique().to_list(), sep='\n')

# Test du dataset de données démographiques par circo

N'est plus utilisé : le code a été refactorisé dans `preprocess.py`.

In [None]:
import plotly.graph_objects as go
from unidecode import unidecode
import pandas as pd
import json

MAP_125_FPATH = '../assets/maps/districts_QC.geojson'
DEMOGRAPHICS_FPATH = '../assets/data/donneesSocio2021.csv'

# Get the map. GeoJSON is UTF-8 by specification; state it explicitly so the
# accented names do not depend on the platform's default encoding.
with open(MAP_125_FPATH, encoding='utf-8') as f:
    districts = json.load(f)
print('Number of features :', len(districts['features']))

# Clean the names: transliterate them to plain ASCII
for feature in districts['features']:
    props = feature['properties']
    props['NM_CEP'] = unidecode(props['NM_CEP'])
# Roughly sort. NOT RELIABLE for 1-to-1 matching
districts['features'].sort(key=lambda x: x['properties']['NM_CEP'].lower())

# Add unique IDs (the position of each feature after sorting)
for i, feature in enumerate(districts['features']):
    feature['properties']['ID'] = i

# Display the keys
print('Property keys for the map :')
for key, value in districts['features'][0]['properties'].items():
    print('|\t', key, ':', type(value), '; example :', value)

# Get the demographics
df = pd.read_csv(DEMOGRAPHICS_FPATH, sep=';')
df.rename(columns={s: unidecode(s) for s in df.columns}, inplace=True)
df.rename(columns={'Circonscription/ DSE 2021': 'Circonscription'}, inplace=True)
print('\nDataframe columns :')
#print(*zip(df.columns, df.dtypes), sep='\n')
#print(*df.columns, sep='\n')
df.rename(columns={s: s.strip() for s in df.columns}, inplace=True)

# The 'Province' row is a recap for the entire province, we drop it and clean
df = df[df['Circonscription'] != 'Province'].reset_index(drop=True)
df['Circonscription'] = df['Circonscription'].map(unidecode).map(str.strip)
print('Number of districts :', len(df))

# Sort in the same way as the map (case-insensitive, by name)
df = df.sort_values(by='Circonscription', key=lambda col: col.str.lower())
df = df.reset_index(drop=True)

def get_map(demographics, variable, opacity=0.5):
    """Build a choropleth of one demographic variable over the Quebec districts.

    The polygons come from the module-level ``districts`` GeoJSON. Value *i*
    of the column is paired with the *i*-th feature of ``districts``, so the
    rows of ``demographics`` must be in the same order as the features.

    Args:
        demographics: DataFrame holding the column to display.
        variable: Name of the column used to colour the districts.
        opacity: Fill opacity of the polygons.

    Returns:
        The plotly figure (not shown yet).
    """
    column = demographics[variable]
    z = column.values
    # Text columns are stored as ``object`` by older pandas and as a dedicated
    # string dtype by newer versions; both need parsing.
    if column.dtype == 'object' or pd.api.types.is_string_dtype(column.dtype):
        # Values use a decimal comma and carry one trailing character that is
        # dropped before parsing (presumably a '%' sign -- TODO confirm).
        z = [float(s[:-1].replace(',', '.')) for s in z]
    fig = go.Figure(go.Choroplethmap(
        geojson=districts,
        featureidkey='properties.ID',
        locations=[f['properties']['ID'] for f in districts['features']],
        z=z,
        hovertext=[f['properties']['NM_CEP'] for f in districts['features']],
        marker_opacity=opacity, marker_line_width=0))
    # NOTE(review): update_geos configures `geo` subplots, while this trace is
    # drawn on a `map` subplot -- presumably no visible effect; confirm.
    fig.update_geos(
        projection=dict(
            type="conic conformal",
            parallels=[50, 46]))
    fig.update_layout(
        map=dict(center=dict(lat=54, lon=-68.5), zoom=3.65),
        width=600, height=800)
    return fig


In [None]:
# Show one sample value (taken from the first row) for every column.
for col in df.columns:
    print(col, '| example :', df.loc[0, col])

In [None]:
# Column dtypes, then the names of the integer columns.
print(*df.dtypes, sep='\n')
print(*df.columns[df.dtypes=='int'], sep='\n')

# Columns of interest. 'Circonscription' is the name the district column was
# renamed to when the dataframe was loaded.
relevant_columns = [
    'Circonscription',
    'Population totale selon les groupes d\'age',
    'Age moyen',
    'Age median',
]
    


In [None]:
## Sanity checks: the map and the dataframe must list the same districts, in the same order
all_map_districts = [f['properties']['NM_CEP'] for f in districts['features']]
df_districts = df['Circonscription'].tolist()
print(len(df))

# Test double inclusion (sets make each membership test O(1))
map_names = set(all_map_districts)
df_names = set(df_districts)
for name in df_districts:
    assert name in map_names, f'{name} not in the map'
for name in all_map_districts:
    assert name in df_names, f'{name} not in the dataframe'

# zip() stops at the shorter sequence, so compare the lengths explicitly
assert len(all_map_districts) == len(df_districts), (
    f'{len(all_map_districts)} map districts but {len(df_districts)} dataframe rows')

# Test full ordering
for a, b in zip(all_map_districts, df_districts):
    assert a == b, f'{a} not equal to {b}'

In [None]:
# Variables tried so far; change the index below to map another one.
candidate_vars = (
    "Population totale agee de 15 ans et plus dans les menages prives selon le plus haut certificat, diplome ou grade",
    "Immigrants",
    "Population totale selon les groupes d'age",
)
var = candidate_vars[1]

# Draw the selected variable and use its name as the figure title.
fig = get_map(df, var, opacity=0.3)
fig.update_layout(title=var)
fig.show()

# Agrégation des datasets par arrondissement

Conversion des multiples CSVs en un seul gros (`arrondissements.csv`).

In [None]:
import os
import os.path as osp
import numpy as np
import pandas as pd

DIRPATH = '../assets/data/immigration_extracted_csvs'
csv_fnames = os.listdir(DIRPATH)

# The text before the first underscore of each file name is taken as the
# arrondissement name; keep the distinct values in sorted order.
arrond_names = sorted({fname.split('_')[0] for fname in csv_fnames})

# Names starting with "Le_" were cut at their own underscore: replace the
# truncated 'Le' entry with the two full names.
arrond_names.remove('Le')
arrond_names += ['Le_Plateau-Mont-Royal', 'Le_Sud-Ouest']
print(len(arrond_names), arrond_names)

# The names are all garbled, clean them
def clean(s):
    """Repair the mis-encoded characters of a label and trim it.

    The garbled sequences look like UTF-8 text that was decoded with a
    single-byte codec (presumably Latin-1): 'Ã©' for 'é', 'Ã´' for 'ô', etc.

    Args:
        s: The raw label (a column name or a category value).

    Returns:
        The label with the known garbled sequences replaced, '&nbsp' removed
        and surrounding whitespace stripped.
    """
    # Done before strip(): '\xa0' counts as whitespace, so stripping first
    # would cut a trailing 'Ã\xa0' in half and the final 'à' would be lost.
    # The lone 'â' (left over from a garbled apostrophe) must also be handled
    # before 'Ã¢' is turned back into a genuine 'â' further down.
    before_strip = (
        ('â', '\''),
        ('Ã¯', 'ï'),
        ('É¯', 'ï'),
        ('Ã\xa0', 'à'),
    )
    after_strip = (
        ('Ã©', 'é'),
        ('Ã´', 'ô'),
        ('Ã§', 'ç'),
        ('Ã¢', 'â'),
        ('Ã¨', 'è'),
        ('&nbsp', ''),
        ('Ã\x89', 'É'),
        ('\x80\x99', ''),
    )
    for broken, fixed in before_strip:
        s = s.replace(broken, fixed)
    s = s.strip()
    for broken, fixed in after_strip:
        s = s.replace(broken, fixed)
    return s
# Some tables have extra columns because they give both a count and a
# percentage. Only the count matters here (percentages can be recomputed), so
# the 'Nombre_' marker is removed to get the same labels everywhere.
def unify(s):
    """Return ``s`` with every occurrence of 'Nombre_' removed."""
    pieces = s.split('Nombre_')
    return ''.join(pieces)
def get_csv_number(fname):
    """Return the integer found between the last '_' and the next '.' of ``fname``."""
    tail = fname.rpartition('_')[2]
    digits = tail.partition('.')[0]
    return int(digits)


# Load one CSV and normalise its labels
def get_csv(fname):
    """Read ``fname`` from ``DIRPATH`` and return it with cleaned labels.

    Column names go through ``clean`` then ``unify``; the values of the
    'Catégorie' column go through ``clean``.
    """
    path = osp.join(DIRPATH, fname)
    frame = pd.read_csv(path, sep=',')
    frame.columns = [unify(clean(label)) for label in frame.columns]
    frame['Catégorie'] = frame['Catégorie'].apply(clean)
    return frame

# Build the column names together with the two Montréal-wide rows
col_names = ['Arrondissement']
values_agglo = ['Agglomération de Montréal']
values_ville = ['Ville de Montréal']

# Half-manual labelling: the CSVs do not carry their own table title, so some
# table groups (CSV number // 3) get a hand-written prefix.
# NOTE(review): groups 3 and 11 spell their prefix differently ("permamente"
# vs "permanente"); kept as-is because these strings become column names.
group_prefixes = {
    3: "Non permamente, ",
    4: "Âge à l'immigration, ",
    6: "Imigration récente (entre 2016 et 2021), ",
    11: "Non permanente, ",
}
# Within a group, CSV number % 3 selects the men / women variant (no suffix otherwise).
variant_suffixes = {1: ' - Hommes', 2: ' - Femmes'}

# The files of the first arrondissement serve as the template for the columns.
candidates = sorted(
    (s for s in csv_fnames if s.startswith(arrond_names[0])),
    key=get_csv_number)
for csv_fname in candidates:
    df = get_csv(csv_fname)

    group, variant = divmod(get_csv_number(csv_fname), 3)
    prefix = group_prefixes.get(group, '')
    suffix = variant_suffixes.get(variant, '')
    col_names += [prefix + name + suffix for name in df['Catégorie'].values.tolist()]

    values_agglo += df['Agglomération de Montréal'].values.tolist()
    values_ville += df['Ville de Montréal'].values.tolist()

# Start the table with the two Montréal-wide rows
main_df = pd.DataFrame(
    data=np.array([values_agglo, values_ville]),
    columns=col_names)

# Then build one single-row frame per arrondissement
frames = [main_df]
for arrond_name in arrond_names:
    values = [arrond_name]

    candidates = sorted(
        (s for s in csv_fnames if s.startswith(arrond_name)),
        key=get_csv_number)
    for csv_fname in candidates:
        df_arrond = get_csv(csv_fname)

        # 7-column tables: take the second-to-last column, otherwise the last
        # one (presumably the last column is then a percentage -- TODO confirm).
        position = -2 if len(df_arrond.columns) == 7 else -1
        values += df_arrond[df_arrond.columns[position]].values.tolist()
    frames.append(pd.DataFrame(data=np.array([values]), columns=col_names))

# Stack everything in order and write the aggregated file
main_df = pd.concat(frames)

main_df.to_csv('../assets/data/arrondissements.csv', index=False)

# Test du dataset de vote

In [None]:
import pandas as pd
import os
import os.path as osp
import numpy as np
from unidecode import unidecode

FPATH = '../assets/data/resultats.csv'

# Load the results and transliterate the district names to plain ASCII
df = pd.read_csv(FPATH, sep=',')
df['nomCirconscription'] = df['nomCirconscription'].map(unidecode)

# One column name per line
print('\n'.join(map(str, df.columns)))
#print(*df['nomCirconscription'].unique(), sep='\n')
#print(len(df['nomCirconscription'].unique()))


In [None]:
import pandas as pd
import json
from unidecode import unidecode
import plotly.graph_objects as go

MAP_125_FPATH = '../assets/maps/districts_QC.geojson'
DEMOGRAPHICS_FPATH = '../assets/data/donneesSocio2021.csv'

# Get the map. GeoJSON is UTF-8 by specification; state it explicitly so the
# accented names do not depend on the platform's default encoding.
with open(MAP_125_FPATH, encoding='utf-8') as f:
    districts = json.load(f)
print('Number of features :', len(districts['features']))

# Clean the names: transliterate them to plain ASCII
for feature in districts['features']:
    props = feature['properties']
    props['NM_CEP'] = unidecode(props['NM_CEP'])
# Roughly sort. NOT RELIABLE for 1-to-1 matching
districts['features'].sort(key=lambda x: x['properties']['NM_CEP'].lower())

# Add unique IDs (the position of each feature after sorting)
for i, feature in enumerate(districts['features']):
    feature['properties']['ID'] = i

# Display the keys
print('Property keys for the map :')
for key, value in districts['features'][0]['properties'].items():
    print('|\t', key, ':', type(value), '; example :', value)

# District names on both sides; the vote names are sorted case-insensitively,
# i.e. with the same key as the map features above
all_map_districts = [f['properties']['NM_CEP'] for f in districts['features']]
all_vote_districts = df['nomCirconscription'].unique().tolist()
all_vote_districts = [unidecode(s) for s in all_vote_districts]
all_vote_districts.sort(key=lambda x: x.lower())

def get_map(demographics, variable, opacity=0.5):
    """Build a choropleth of one variable over the Quebec districts.

    The polygons come from the module-level ``districts`` GeoJSON. Value *i*
    of the column is paired with the *i*-th feature of ``districts``, so the
    rows of ``demographics`` must be in the same order as the features.

    Args:
        demographics: DataFrame holding the column to display.
        variable: Name of the column used to colour the districts.
        opacity: Fill opacity of the polygons.

    Returns:
        The plotly figure (not shown yet).
    """
    column = demographics[variable]
    z = column.values
    # Text columns are stored as ``object`` by older pandas and as a dedicated
    # string dtype by newer versions; both need parsing.
    if column.dtype == 'object' or pd.api.types.is_string_dtype(column.dtype):
        # Values use a decimal comma and carry one trailing character that is
        # dropped before parsing (presumably a '%' sign -- TODO confirm).
        z = [float(s[:-1].replace(',', '.')) for s in z]
    fig = go.Figure(go.Choroplethmap(
        geojson=districts,
        featureidkey='properties.ID',
        locations=[f['properties']['ID'] for f in districts['features']],
        z=z,
        hovertext=[f['properties']['NM_CEP'] for f in districts['features']],
        marker_opacity=opacity, marker_line_width=0))
    # NOTE(review): update_geos configures `geo` subplots, while this trace is
    # drawn on a `map` subplot -- presumably no visible effect; confirm.
    fig.update_geos(
        projection=dict(
            type="conic conformal",
            parallels=[50, 46]))
    fig.update_layout(
        map=dict(center=dict(lat=54, lon=-68.5), zoom=3.65),
        width=600, height=800)
    return fig

In [None]:
# Average every numeric column per district. groupby() moves the district
# name into the index, so it is no longer available as a column.
df_per_circo = df.groupby('nomCirconscription').mean(numeric_only=True)

# Both sides must describe exactly the same districts
map_only = sorted(set(all_map_districts) - set(df_per_circo.index))
votes_only = sorted(set(df_per_circo.index) - set(all_map_districts))
assert not map_only, f'{map_only} not in the vote data'
assert not votes_only, f'{votes_only} not in the map'

# groupby() orders its keys case-sensitively whereas the map features were
# sorted case-insensitively: put the rows in map order so that value i colours
# feature i.
df_per_circo = df_per_circo.reindex(all_map_districts)

fig = get_map(df_per_circo, 'tauxParticipation', opacity=0.3)
fig.show()


# Test du code de preprocess

In [None]:

from maps import *
from preprocess import *

# Load the map and the demographics through the refactored helpers
map_data = get_map_data()
df = get_demographics_data()

# Row i of the dataframe must describe feature i of the map
map_names = [feature['properties']['NM_CEP'] for feature in map_data['features']]
for a, b in zip(map_names, df['Circonscription']):
    assert a == b, f'{a} not equal to {b}'

# Draw one variable as a smoke test
variable = 'Population totale selon les groupes d\'age'
fig = get_map(map_data, df, variable, opacity=0.3)
fig.show()