**Note:** When you first open the notebook you may note see all of the visualization since the javascript sometimes breaks and does not show it. Make sure to run all of the cells to see them all. Hope you like it!

In [1]:
import pandas as pd
import altair as alt

import geopandas as gpd # Requires geopandas -- e.g.: conda install -c conda-forge geopandas

# work-around to let Altair handle larger data sets
# This will use an external file to store our data instead of embedding it directly in the
# visualization
alt.data_transformers.enable('json')

import ipywidgets as widgets
from IPython.display import display, clear_output

pass # Don't show any output in this cell

In [2]:
# Choose a data set to work with.

data = pd.read_csv('dpt2020.csv')

In [3]:
# Let's take a look at the data

data.head()


Unnamed: 0,sexe;preusuel;annais;dpt;nombre
0,1;_PRENOMS_RARES;1900;02;7
1,1;_PRENOMS_RARES;1900;04;9
2,1;_PRENOMS_RARES;1900;05;8
3,1;_PRENOMS_RARES;1900;06;23
4,1;_PRENOMS_RARES;1900;07;9


In [4]:
names = pd.read_csv("dpt2020.csv", sep=";")
names.drop(names[names.preusuel == '_PRENOMS_RARES'].index, inplace=True)
names.drop(names[names.dpt == 'XX'].index, inplace=True)




Let's have a look at the most popular names for each department and year

In [5]:
most_common_names = names.loc[names.groupby('dpt')['nombre'].idxmax()]

most_common_names.head()


Unnamed: 0,sexe,preusuel,annais,dpt,nombre
3049377,2,MARIE,1901,1,732
775010,1,JEAN,1946,2,842
3049473,2,MARIE,1902,3,673
3049380,2,MARIE,1901,4,168
3049381,2,MARIE,1901,5,185


In [6]:
depts = gpd.read_file('departements-version-simplifiee.geojson')


Let's filter the year birth

In [7]:
names['annais'] = pd.to_numeric(names['annais'], errors='coerce')
#filtered_names = names[(names['annais'] >= 2000) & (names['annais'] <= 2010)]

filtered_names = names[(names['annais'] >= 2010) ]
filtered_names.head()


Unnamed: 0,sexe,preusuel,annais,dpt,nombre
10888,1,AAHIL,2016,95,3
11380,1,AARON,2010,1,8
11381,1,AARON,2010,2,14
11382,1,AARON,2010,3,8
11383,1,AARON,2010,5,4


And sum for each department the name occurence through the years 

In [8]:
grouped = filtered_names.groupby(['dpt', 'preusuel', 'sexe'], as_index=False).sum()
grouped.head()

Unnamed: 0,dpt,preusuel,sexe,annais,nombre
0,1,AARON,1,22165,140
1,1,ABDALLAH,1,4030,7
2,1,ABEL,1,4037,6
3,1,ADAM,1,22165,195
4,1,ADEL,1,2015,3


Now we capture the most common name for each departement and put the geodata 

In [9]:
most_common_names = grouped.loc[grouped.groupby('dpt')['nombre'].idxmax()]
grouped = depts.merge(most_common_names, how='right', left_on='code', right_on='dpt') 
grouped.head()

Unnamed: 0,code,nom,geometry,dpt,preusuel,sexe,annais,nombre
0,1,Ain,"POLYGON ((4.78021 46.17668, 4.79458 46.21832, ...",1,LUCAS,1,22165,347
1,2,Aisne,"POLYGON ((4.04797 49.40564, 4.03991 49.39740, ...",2,JULES,1,22165,484
2,3,Allier,"POLYGON ((3.03207 46.79491, 3.04907 46.75808, ...",3,JULES,1,22165,308
3,4,Alpes-de-Haute-Provence,"POLYGON ((5.67604 44.19143, 5.69209 44.18648, ...",4,EMMA,2,22165,88
4,5,Hautes-Alpes,"POLYGON ((6.26057 45.12685, 6.29922 45.10855, ...",5,EMMA,2,22165,113


### Answer to question 2

In [10]:

map = alt.Chart(grouped).mark_geoshape(stroke='white').encode(
    tooltip=['nom', 'code', 'nombre','preusuel'],
    color='nombre',
).properties(width=800, height=600)

map

Instead of looking at the names for each departement, we overlook the departement data, and focus on the birth year

In [11]:
grouped2 = names.groupby(['preusuel', 'sexe','annais'], as_index=False).sum()
grouped2.head()


Unnamed: 0,preusuel,sexe,annais,dpt,nombre
0,AADIL,1,1983,84,3
1,AADIL,1,1992,92,3
2,AAHIL,1,2016,95,3
3,AALIYA,2,2017,75,3
4,AALIYAH,2,2001,92971,9


Popularity of mix name through the time, we picked Camille, but you can pick any name !

In [12]:
camille = grouped2[grouped2['preusuel'] == 'CAMILLE']
camille 


Unnamed: 0,preusuel,sexe,annais,dpt,nombre
38951,CAMILLE,1,1900,0102030405070810111213141516171819212324252627...,1188
38952,CAMILLE,1,1901,0102030708101112131415161718192021232425262728...,1264
38953,CAMILLE,1,1902,0102030407080910111213141516171819212324252627...,1298
38954,CAMILLE,1,1903,0102030405060708091011121314151617181920212324...,1285
38955,CAMILLE,1,1904,0102030506070809101112131415161718192021232425...,1422
...,...,...,...,...,...
39188,CAMILLE,2,2016,0102030506070810111213141516171819202122232425...,2693
39189,CAMILLE,2,2017,0102030405060708091011121314151617181920212223...,2364
39190,CAMILLE,2,2018,0102030607081011121314151617181920212224252627...,2050
39191,CAMILLE,2,2019,0102030607080910111314151617181920212224252627...,1830


In [13]:
alt.Chart(camille).mark_line().encode(
    x='annais',
    y='nombre',
    color='sexe' 
)

In [14]:
chart_homme = alt.Chart(camille[camille['sexe'] == 1]).mark_line(color='blue').encode(
    x='annais',
    y='nombre'
)

chart_femme = alt.Chart(camille[camille['sexe'] == 2]).mark_line(color='red').encode(
    x='annais',
    y='nombre'
)

superposed_chart = alt.layer(chart_homme, chart_femme).resolve_scale(
    #y='independent'
)

superposed_chart

### Answer to question 1

Popularity of names through the time, we took Thomas, Lucas, and Jean but you can pick the names you want !

In [15]:
Thomas = grouped2[grouped2['preusuel'] == 'THOMAS']
Lucas = grouped2[grouped2['preusuel'] == 'LUCAS']
Jean = grouped2[grouped2['preusuel'] == 'JEAN']

chart_Thomas = alt.Chart(Thomas).mark_line(color='blue').encode(
    x='annais',
    y='nombre'
)

chart_Lucas = alt.Chart(Lucas).mark_line(color='red').encode(
    x='annais',
    y='nombre'
)

chart_Jean = alt.Chart(Jean).mark_line(color='green').encode(
    x='annais',
    y='nombre'
)

superposed_chart = alt.layer(chart_Thomas, chart_Lucas, chart_Jean).resolve_scale(
    #y='independent'
)

legend = alt.Chart(pd.DataFrame({
    'preusuel': ['Thomas', 'Lucas', 'Jean'],
    'color': ['blue', 'red', 'green']
})).mark_point(size=10).encode(
    y=alt.Y('preusuel:N', axis=alt.Axis(title='')),
    color=alt.Color('color:N', scale=None)
).properties(
    width=50,
    height=150
)


final_chart = alt.hconcat(
    superposed_chart,
    legend
)

final_chart

### Answer to question 3

Don't mind that some tasks have been repeated in the following cells. run every cell.

In [16]:
names = pd.read_csv("dpt2020.csv", sep=";")
names.drop(names[names.preusuel == '_PRENOMS_RARES'].index, inplace=True)
names.drop(names[names.dpt == 'XX'].index, inplace=True)

names.sample(5)

Unnamed: 0,sexe,preusuel,annais,dpt,nombre
2144295,2,CHRISTELLE,1983,63,30
3245003,2,MIYA,2019,34,7
196459,1,AYLAN,2019,68,4
2519231,2,GERMAINE,1915,9,16
853451,1,JEROME,1968,4,9


In [17]:
# Just to see some names that are common among male and female for testing the visualization
name_counts = names.groupby(['preusuel', 'sexe']).size().reset_index(name='count')

name_pivot = name_counts.pivot(index='preusuel', columns='sexe', values='count').fillna(0)

# identify names that appear for both sexes
common_names = name_pivot[(name_pivot[1] > 0) & (name_pivot[2] > 0)].reset_index()

In [18]:
common_names

sexe,preusuel,1,2
0,ABDON,99.0,11.0
1,ABEL,2979.0,17.0
2,ACHILLE,1564.0,20.0
3,ADAM,2357.0,1.0
4,ADAMA,273.0,112.0
...,...,...,...
863,ZACHARIE,580.0,21.0
864,ZAYANE,1.0,2.0
865,ZOÉ,32.0,2876.0
866,ÉDEN,170.0,25.0


In [19]:
names_sex1 = names[names['sexe'] == 1]
names_sex2 = names[names['sexe'] == 2]

Running the cell below will give the visualization for question3. You can choose the name you want the visualiztation for throught the dropdown menu. The dropdown interactivity has been implemented with ipywidgets instead of Altair since Altair does not support the approach that has been taken here. Basically, each time the a name is chosen through the dropdown menu the visulatization is re-rendered with the new dataframe curated for that name.

In [29]:
# Create a dropdown widget
unique_names = names_sex1['preusuel'].unique().tolist()
dropdown = widgets.Dropdown(options=unique_names, description='Name:')
sexe_mapping = {1: 'Male', 2: 'Female'}

# Define the function to update the chart
def update_chart(name):
    df_edith1 = names_sex1[names_sex1['preusuel'] == name].copy()
    
    df_edith1.loc[:, 'annais'] = pd.to_numeric(df_edith1['annais'], errors='coerce')
    year_bins = [1900, 1905, 1910,1915, 1920,1925, 1930,1935, 1940,1945, 1950,1955, 1960,1965, 1970,1975, 1980,1985, 1990,1995, 2000,2005, 2010,2015, 2020]
    bin_labels = [f'{year_bins[i]}-{year_bins[i+1]}' for i in range(len(year_bins)-1)]
    df_edith1.loc[:, 'year_bin'] = pd.cut(df_edith1['annais'], bins=year_bins, labels=bin_labels, right=False)
    df_edith1['sexe'] = df_edith1['sexe'].map(sexe_mapping)
    df_aggregated1 = df_edith1.groupby(['year_bin', 'sexe'],  observed=False).agg({'nombre': 'sum'}).reset_index()
    
    
    bar1 = alt.Chart(df_aggregated1).mark_bar(opacity=0.8, color='blue').encode(
        x=alt.X('year_bin:O', title='Year Bin'),
        y=alt.Y('nombre:Q', title='Number of People'),
        tooltip=['year_bin', 'nombre', 'sexe']
    )
    
    
    df_edith2 = names_sex2[names_sex2['preusuel'] == name].copy()
    
    df_edith2.loc[:, 'annais'] = pd.to_numeric(df_edith2['annais'], errors='coerce')
    year_bins = [1900, 1905, 1910,1915, 1920,1925, 1930,1935, 1940,1945, 1950,1955, 1960,1965, 1970,1975, 1980,1985, 1990,1995, 2000,2005, 2010,2015, 2020]
    bin_labels = [f'{year_bins[i]}-{year_bins[i+1]}' for i in range(len(year_bins)-1)]
    df_edith2.loc[:, 'year_bin'] = pd.cut(df_edith2['annais'], bins=year_bins, labels=bin_labels, right=False)
    df_edith1['sexe'] = df_edith1['sexe'].map(sexe_mapping)
    df_aggregated2 = df_edith2.groupby(['year_bin', 'sexe'],  observed=False).agg({'nombre': 'sum'}).reset_index()
    
    bar2 = alt.Chart(df_aggregated2).mark_bar(opacity=0.75, color='red').encode(
        x=alt.X('year_bin:O', title='Year Bin'),
        y=alt.Y('nombre:Q', title='Number of People'),
        tooltip=['year_bin', 'nombre', 'sexe']
    )
 
    # Create a horizontal rule to represent the average number of males
#     avg_line_male = alt.Chart(df_edith1).mark_rule(color='blue').encode(
#         y='average(nombre):Q',
#         tooltip=alt.Tooltip('average:Q', title='Average Number of Males')
#     )
    
#     avg_line_female = alt.Chart(df_edith2).mark_rule(color='red').encode(
#     y='average(nombre):Q',
#     tooltip=alt.Tooltip('average(nombre):Q', title='Average Number of Females')
#     )
    
    # Layer the bar charts and the horizontal line
    layered_chart = alt.layer(bar1, bar2).resolve_scale(
        y='shared'
    ).properties(
        title=f"Layered Bar Chart of the number of people with name '{name}' in both genders through time"
    )
    
    # Clear the output area before displaying the new chart
    with output:
        clear_output(wait=True)
        display(layered_chart)

# Create an output widget to display the chart
output = widgets.Output()

# Display the dropdown and set up the observer
dropdown.observe(lambda change: update_chart(change['new']), names='value')
display(dropdown, output)
    
    
# Initial display
update_chart(unique_names[0])

Dropdown(description='Name:', options=('AADIL', 'AAHIL', 'AARON', 'AARONN', 'AARUSH', 'AAYAN', 'AB', 'AB-DEL',…

Output()