# Carte des arrondissements

Test du choropleth Plotly avec la géographie des arrondissements de Montréal

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import json

MAP_FPATH = '../assets/maps/arrondissements_montreal.geojson'

# GeoJSON is UTF-8 by specification; pass the encoding explicitly so that
# non-ASCII text decodes the same way whatever the platform locale is.
with open(MAP_FPATH, encoding='utf-8') as f:
    arrond = json.load(f)

# Quick look at the structure of the loaded GeoJSON object.
print(arrond.keys())
print(arrond['type'])
print(arrond['crs'])
print(arrond['features'])

# Property used both to identify each feature and as its (dummy) colour value.
# Named `id_key` so that it does not shadow the builtin `id`.
id_key = 'no_qr'
features = arrond['features']
feature_ids = [feature['properties'][id_key] for feature in features]
fig = go.Figure(
    go.Choroplethmap(
        geojson=arrond,
        featureidkey='properties.' + id_key,
        locations=feature_ids,
        z=feature_ids,
        hovertext=[feature['properties']['nom_qr'] for feature in features],
        colorscale="Viridis",
        marker_opacity=0.5, marker_line_width=0))
fig.update_layout(map=dict(center=dict(lat=45.5517, lon=-73.7073), style="carto-positron", zoom=8.7))
fig.show()

# Districts électoraux de Montréal
Il manque tout ce qui ne fait pas techniquement partie de la ville de Montréal, p. ex. Westmount.

In [None]:
import plotly.graph_objects as go
import json

MAP_FPATH = '../assets/maps/districts_montreal.geojson'

# GeoJSON is UTF-8 by specification; do not depend on the locale default.
with open(MAP_FPATH, encoding='utf-8') as f:
    districts = json.load(f)

# Quick look at the structure of the collection and of its first feature.
print(districts.keys())
print(districts['type'])
print(districts['name'])
print(len(districts['features']))
first_feature = districts['features'][0]
print(first_feature['geometry'])
print(first_feature.keys())
print(first_feature['properties'].keys())

# Property used both to identify each feature and as its (dummy) colour value.
# Named `id_key` so that it does not shadow the builtin `id`.
id_key = 'id'
features = districts['features']
feature_ids = [feature['properties'][id_key] for feature in features]
fig = go.Figure(go.Choroplethmap(
    geojson=districts,
    featureidkey='properties.' + id_key,
    locations=feature_ids,
    z=feature_ids,
    hovertext=[feature['properties']['nom'] for feature in features],
    marker_opacity=0.5, marker_line_width=0))
fig.update_layout(
    map=dict(
        center=dict(lat=45.5517, lon=-73.7073), zoom=8.7))
fig.show()

# Superposition des arrondissements et des circonscriptions

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import json

ARROND_FPATH = '../assets/maps/arrondissements_montreal.geojson'
CIRCO_FPATH = '../assets/maps/districts_montreal.geojson'


# GeoJSON is UTF-8 by specification; do not depend on the locale default.
with open(ARROND_FPATH, encoding='utf-8') as f:
    arrond = json.load(f)
with open(CIRCO_FPATH, encoding='utf-8') as f:
    districts = json.load(f)

# Boroughs (arrondissements), outlined in blue. Every feature gets the same
# z value (0), so the trace is uniformly coloured.
arrond_id_key = 'no_qr'
arrond_features = arrond['features']
fig = go.Figure(
    go.Choroplethmap(
        geojson=arrond,
        featureidkey='properties.' + arrond_id_key,
        locations=[feature['properties'][arrond_id_key] for feature in arrond_features],
        z=[0] * len(arrond_features),
        hovertext=[feature['properties']['nom_qr'] for feature in arrond_features],
        marker_opacity=0.5, marker_line_width=5, marker_line_color='blue'))

# Electoral districts (circonscriptions), outlined in red, all with z value 1.
district_id_key = 'id'
district_features = districts['features']
fig = fig.add_trace(go.Choroplethmap(
    geojson=districts,
    featureidkey='properties.' + district_id_key,
    locations=[feature['properties'][district_id_key] for feature in district_features],
    z=[1] * len(district_features),
    hovertext=[feature['properties']['nom'] for feature in district_features],
    colorscale="Viridis",
    marker_opacity=0.5, marker_line_width=5, marker_line_color='red'))
fig.update_layout(
    showlegend=True,
    map=dict(
        center=dict(lat=45.5517, lon=-73.7073), zoom=8.7))
fig.show()

# Test des 125 districts électoraux du Québec

In [None]:
import plotly.graph_objects as go
import numpy as np
import json

MAP_FPATH = '../assets/maps/districts_QC.geojson'

# GeoJSON is UTF-8 by specification; do not depend on the locale default.
with open(MAP_FPATH, encoding='utf-8') as f:
    districts = json.load(f)

# Tag every feature with its position in the list so that it can be used as
# the feature identifier below.
for i, feature in enumerate(districts['features']):
    feature['properties']['ID'] = i

# Quick look at the structure of the collection and of its first feature.
print(districts.keys())
print(districts['type'])
print(districts['name'])
print(districts['crs'])
print(len(districts['features']))
print(districts['features'][0].keys())
for key, value in districts['features'][0]['properties'].items():
    print('|\t', key, ':', value)

# Property used both to identify each feature and as its (dummy) colour value.
# Named `id_key` so that it does not shadow the builtin `id`.
id_key = 'ID'
features = districts['features']
feature_ids = [feature['properties'][id_key] for feature in features]
fig = go.Figure(go.Choroplethmap(
    geojson=districts,
    featureidkey='properties.' + id_key,
    locations=feature_ids,
    z=feature_ids,
    hovertext=[feature['properties']['NM_CEP'] for feature in features],
    marker_opacity=0.5, marker_line_width=1))

# NOTE(review): update_geos configures `layout.geo` subplots, while the trace
# above is positioned through `layout.map` -- confirm this call has any effect.
fig.update_geos(
    projection=dict(
        type="conic conformal",
        parallels=[50, 46],
    ))
fig.update_layout(
    map=dict(center=dict(lat=54, lon=-68.5), zoom=3.7),
    width=600, height=800)

fig.show()

## Test du dataset de langues, etc.

Des données de langue sur l'ensemble du Québec (enfin, les villes principales), juste pour voir comment c'était organisé.

In [None]:
import pandas as pd

# Load the language dataset with pandas; the file is semicolon-delimited.
FPATH = '../assets/data/langues.csv'
df = pd.read_csv(FPATH, delimiter=';')

In [None]:
# polars est pas mal plus rapide
import polars as pl

FPATH = '../assets/data/langues.csv'
df = pl.read_csv(FPATH, separator=';')

# Drop the empty columns: "Symbole" and its eight "Symbole_duplicated_<i>" copies
empty_columns = ["Symbole", *(f"Symbole_duplicated_{i}" for i in range(8))]
df = df.drop(empty_columns)


In [None]:
# List the column names, one per line
print('\n'.join(df.columns))

In [None]:
# Preview the first rows of the dataframe
print(df.head())

In [None]:
# List the distinct values of the 'GEO' column, one per line
print(*df['GEO'].unique().to_list(), sep='\n')

# Test du dataset de données démographiques par circo

In [None]:
import plotly.graph_objects as go
from unidecode import unidecode
import pandas as pd
import json

MAP_125_FPATH = '../assets/maps/districts_QC.geojson'
DEMOGRAPHICS_FPATH = '../assets/data/donneesSocio2021.csv'

# Get the map. GeoJSON is UTF-8 by specification; do not depend on the locale default.
with open(MAP_125_FPATH, encoding='utf-8') as f:
    districts = json.load(f)
features = districts['features']  # same list object as districts['features']
print('Number of features :', len(features))

# Clean the names: transliterate them to ASCII, as is done for the dataframe below
for feature in features:
    feature['properties']['NM_CEP'] = unidecode(feature['properties']['NM_CEP'])
# Rough, case-insensitive sort by name. NOT RELIABLE on its own for 1-to-1
# matching with the dataframe; the matching is done by name further down.
features.sort(key=lambda feature: feature['properties']['NM_CEP'].lower())

# Add unique IDs (the position of each feature after sorting)
for i, feature in enumerate(features):
    feature['properties']['ID'] = i

# Display the keys
print('Property keys for the map :')
for key, value in features[0]['properties'].items():
    print('|\t', key, ':', type(value), '; example :', value)

# Get the demographics
df = pd.read_csv(DEMOGRAPHICS_FPATH, sep=';')
df = df.rename(columns=unidecode)
df = df.rename(columns={'Circonscription/ DSE 2021': 'Circonscription'})
print('\nDataframe columns :')
print(*df.columns, sep='\n')
df = df.rename(columns=str.strip)

# Drop the province-wide recap row(s), then clean the district names
df = df[df['Circonscription'] != 'Province'].reset_index(drop=True)
df['Circonscription'] = df['Circonscription'].map(lambda s: unidecode(s).strip())
print('Number of districts :', len(df))

# Give each dataframe row the ID of the map feature that has the same name,
# then sort by ID so that row i of the dataframe describes feature i of the map.
all_map_districts = [feature['properties']['NM_CEP'] for feature in features]
name_to_id = {}
for idx, name in enumerate(all_map_districts):
    name_to_id.setdefault(name, idx)  # first occurrence wins, like list.index
unmatched = [name for name in df['Circonscription'] if name not in name_to_id]
if unmatched:
    # Same exception type as a failed list.index lookup, but names the culprits
    raise ValueError(f'Districts missing from the map: {unmatched}')
df['ID'] = [name_to_id[name] for name in df['Circonscription']]
df = df.sort_values(by='ID').reset_index(drop=True)

def get_map(demographics, variable, opacity=0.5):
    """Build a choropleth of one demographics column over the districts map.

    Uses the module-level ``districts`` GeoJSON and matches features through
    their ``properties.ID`` value. The values are passed in dataframe row
    order, so ``demographics`` is expected to be ordered like the map
    features (presumably the ID-sorted dataframe -- confirm at the call site).

    Args:
        demographics: dataframe holding the column to plot.
        variable: name of the column to plot.
        opacity: marker opacity of the filled areas.

    Returns:
        The plotly figure (not shown).
    """
    column = demographics[variable]
    if column.dtype == 'object':
        # Text values: drop the last character and use '.' as decimal mark
        # (presumably percentages such as "12,3%" -- confirm against the data).
        z = [float(raw[:-1].replace(',', '.')) for raw in column.values]
    else:
        z = column.values
    print(z)

    features = districts['features']
    trace = go.Choroplethmap(
        geojson=districts,
        featureidkey='properties.ID',
        locations=[feature['properties']['ID'] for feature in features],
        z=z,
        hovertext=[feature['properties']['NM_CEP'] for feature in features],
        marker_opacity=opacity, marker_line_width=0)
    fig = go.Figure(trace)

    projection = dict(type="conic conformal", parallels=[50, 46])
    fig.update_geos(projection=projection)
    fig.update_layout(
        map=dict(center=dict(lat=54, lon=-68.5), zoom=3.65),
        width=600, height=800)
    return fig


In [None]:
# Show every column name next to its value in the first row
for col in df.columns:
    print(col, '| example :', df.loc[0, col])

In [None]:
# One dtype per line, then the names of the columns whose dtype equals 'int'
print('\n'.join(str(dtype) for dtype in df.dtypes))
int_columns = df.columns[df.dtypes == 'int']
print(*int_columns, sep='\n')

# Dataframe columns of interest (names as they are after the unidecode clean-up)
relevant_columns = [
    'Circonscription',
    'Population totale selon les groupes d\'age',
    'Age moyen',
    'Age median',
]
    


In [None]:
## Unit tests: assert that the map and the dataframe describe the same
## districts, with matching IDs and in the same order.

print(len(df))

n_districts = len(all_map_districts)
assert len(df) == n_districts, f'{len(df)} rows in the dataframe but {n_districts} districts in the map'

# Test double inclusion
map_names = set(all_map_districts)
df_names = set(df['Circonscription'])
for name in df['Circonscription']:
    assert name in map_names, f'{name} not in the map'
for name in all_map_districts:
    assert name in df_names, f'{name} not in the dataframe'

# Test ID matching: exactly one dataframe row per feature ID, carrying the same name
for feature in districts['features']:
    feature_id = feature['properties']['ID']
    name = feature['properties']['NM_CEP']
    matches = df.loc[df['ID'] == feature_id, 'Circonscription'].tolist()
    assert matches == [name], f'ID {feature_id}: map has {name!r}, dataframe has {matches}'

# Test ordering by ID: row i of the dataframe is feature i of the map
for i, name in enumerate(all_map_districts):
    assert name == df.loc[i, 'Circonscription'], f'{name} not equal to {df.loc[i, "Circonscription"]}'

In [None]:
# Column to map. The second assignment is the one in effect; the first value
# and the commented one are alternatives kept at hand.
var = "Population totale agee de 15 ans et plus dans les menages prives selon le plus haut certificat, diplome ou grade"
var = "Immigrants"
#var = "Population totale selon les groupes d'age"
fig = get_map(df, var, opacity=0.3).update_layout(title=var)
fig.show()

# Test du dataset par arrondissement

Conversion des multiples CSVs en un seul gros (`arrondissements.csv`).

In [None]:
import os
import os.path as osp
import numpy as np

DIRPATH = '../assets/data/immigration_extracted_csvs'
csv_fnames = os.listdir(DIRPATH)

# The arrondissement name is whatever precedes the first underscore of a file name
name_prefixes = [fname.split('_')[0] for fname in csv_fnames]
arrond_names = np.unique(name_prefixes).tolist()

# Names starting with "Le_" are cut at their own underscore: replace the bare
# 'Le' entry with the two full names.
arrond_names.remove('Le')
arrond_names += ['Le_Plateau-Mont-Royal', 'Le_Sud-Ouest']
print(len(arrond_names), arrond_names)

# Names are all garbled (UTF-8 text shown as 'Ã' + another character):
# (garbled sequence, intended character) pairs, applied in this order.
_MOJIBAKE_FIXES = (
    ('Ã©', 'é'),
    ('Ã´', 'ô'),
    ('Ã\xa0', 'à'),
    ('Ã§', 'ç'),
    ('Ã¢', 'â'),
    ('Ã¨', 'è'),
)


def clean(s):
    """Repair the garbled accented characters of `s` and strip surrounding whitespace.

    The replacements run BEFORE the strip: the garbled form of 'à' ends with
    a no-break space ('\\xa0'), which str.strip() treats as whitespace and
    would remove at the end of a string, leaving a stray 'Ã' behind.
    """
    for garbled, intended in _MOJIBAKE_FIXES:
        s = s.replace(garbled, intended)
    return s.strip()


def unify(s):
    """Remove the 'Nombre_' marker from a column name.

    Some files have more columns because they include both a number and a
    percentage. We just care about the number; the % can be computed later.
    """
    return s.replace('Nombre_', '')

# Read and clean
def get_csv(fname):
    """Load the comma-separated file `fname` from DIRPATH with tidy labels.

    Column names go through `clean` then `unify`; the values of the
    'Catégorie' column go through `clean`.
    """
    csv_path = osp.join(DIRPATH, fname)
    frame = pd.read_csv(csv_path, sep=',')
    frame = frame.rename(columns=lambda label: unify(clean(label)))
    frame['Catégorie'] = frame['Catégorie'].map(clean)
    return frame

def _sorted_csvs_for(name):
    """Return the CSV file names of `name`, ordered by their numeric suffix.

    Files are matched on `name` followed by '_' so that a name which happens
    to be the prefix of another one does not pick up the other's files.
    """
    matching = [s for s in csv_fnames if s.startswith(name + '_')]
    matching.sort(key=lambda s: int(s.split('_')[-1].split('.')[0]))
    return matching


# Get column names and values for Montréal globally. Both are read from the
# files of the first arrondissement: one output column per 'Catégorie' row.
col_names = ['Arrondissement']
values_agglo = ['Agglomération de Montréal']
values_ville = ['Ville de Montréal']
for csv_fname in _sorted_csvs_for(arrond_names[0]):
    df = get_csv(csv_fname)
    col_names += df['Catégorie'].values.tolist()
    values_agglo += df['Agglomération de Montréal'].values.tolist()
    values_ville += df['Ville de Montréal'].values.tolist()

# Create the dataframe with the data from Montréal (one row each for the
# agglomeration and the city)
main_df = pd.DataFrame(
    data=np.array([values_agglo, values_ville]),
    columns=col_names)

# Get values for each arrondissement: one single-row dataframe per arrondissement
frames = [main_df]
for arrond_name in arrond_names:
    values = [arrond_name]

    for csv_fname in _sorted_csvs_for(arrond_name):
        df_arrond = get_csv(csv_fname)

        # The layout must be decided from the file being read (df_arrond), not
        # from whichever dataframe was built last. With 7 or more columns the
        # second-to-last one is used (presumably the number/percentage layout,
        # where the last column is the percentage); otherwise the last one.
        arrond_col_name = df_arrond.columns[-2] if len(df_arrond.columns) >= 7 else df_arrond.columns[-1]
        values += df_arrond[arrond_col_name].values.tolist()
    frames.append(pd.DataFrame(
        data=np.array([values]),
        columns=col_names))

# Concatenate once instead of growing main_df inside the loop
main_df = pd.concat(frames)

main_df.to_csv('../assets/data/arrondissements.csv', index=False)

Quelques viz pour le plan

In [None]:
import plotly.express as px
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

# Columns to plot, one subplot row each ('Population totale' is currently disabled)
to_plot = [
    #'Population totale',
    'Immigrante',
    'Réfugiée',
    'Population résidente non permanente',
    'Continent Asie',
    'Continent Europe',
    'Continent Afrique',
    'Continent Amériques',
    'Continent Oceanie et autres lieux de naissance',
    'Parlant anglais à la maison',
    'Parlant français à la maison',
]

df = pd.read_csv('../assets/data/arrondissements.csv')
# Drop the first two rows (the agglomeration and city totals written first by
# the conversion step) so that only the arrondissements remain
df = df.drop([0, 1]).reset_index(drop=True)

# Cycle through the palette so that adding entries to `to_plot` beyond the
# number of available colours does not raise an IndexError
palette = plotly.colors.qualitative.Plotly

# Left column: raw values; right column: values divided by 'Population totale'
fig = make_subplots(rows=len(to_plot), cols=2, shared_xaxes=True, vertical_spacing=0.02, horizontal_spacing=0.1)
for i, col_name in enumerate(to_plot):
    color = palette[i % len(palette)]
    fig.add_trace(go.Bar(
        x=df['Arrondissement'],
        y=df[col_name],
        name=col_name,
        legendgroup=col_name,
        showlegend=True,
        marker_color=color), row=i+1, col=1)
    # Same legend group as the raw-value trace, without a second legend entry
    fig.add_trace(go.Bar(
        x=df['Arrondissement'],
        y=df[col_name]/df['Population totale'],
        legendgroup=col_name,
        showlegend=False,
        marker_color=color), row=i+1, col=2)
fig.update_layout(
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='right',
        x=1),
    width=1000,
    height=800)
fig.show()