**Authors**: <br>
    Iryna Savchuk | 20211310 <br>
    Cátia Parrinha | 20201320 <br>
    Gueu...

**Table of Contents** <br>
* [Import Libraries](#importlibraries)
* [Age](#age)
* [Gender](#gender)
* [Category](#category)
* [Age by Gender](#agebygender)
* [Cetegory by Gender](#categorybygender)
* [Gender by Year](#genderbyyear)
* [Category by Year](#categorybyyear)

<hr>
<a class="anchor" id="importlibraries">
    
# Import libraries
    
</a>

In [12]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff
import chart_studio.plotly as py

import geojson
import country_converter as coco # to convert and match country names

In [108]:
df = pd.read_csv('data/merged.csv')

Add a new column with 'count' with 1 to have the total of awards count

In [109]:
df['count'] = 1
df['total'] = len(df['count'])
df.head(2)

Unnamed: 0,id,firstname,surname,born,died,bornCountry,bornCountryCode,bornCity,diedCountry,diedCountryCode,...,category,overallMotivation,share,motivation,name,city,country,prizeAge,count,total
0,1,Wilhelm Conrad,Röntgen,1845,1923,Prussia (now Germany),DE,Lennep (now Remscheid),Germany,DE,...,physics,,1,"""in recognition of the extraordinary services ...",Munich University,Munich,Germany,56,1,989
1,2,Hendrik A.,Lorentz,1853,1928,the Netherlands,NL,Arnhem,the Netherlands,NL,...,physics,,2,"""in recognition of the extraordinary service t...",Leiden University,Leiden,the Netherlands,49,1,989


<hr>
<a class="anchor" id="age">
    
# Age
    
</a>

In [7]:
x = df['prizeAge']
hist_data = dict(type='histogram', x=x, marker=dict(color='silver'))
layout = dict(title=dict(text='Ages Distribution'))
fig_1 = go.Figure(data=hist_data, layout=layout)
fig_1.show(renderer='browser')

<hr>
<a class="anchor" id="gender">
    
# Gender
    
</a>

In [8]:
df['gender'].unique()

array(['male', 'female', 'org'], dtype=object)

In [9]:
gender_labels = df['gender'].value_counts()
gender_values = (gender_labels / gender_labels.sum()) * 100
unique_genders = df['gender'].unique()

gender_data = dict(type='pie',
                        labels=unique_genders,
                        values=gender_values,
                        textposition='inside', 
                        hole=0.6,
                        marker=dict(colors=['#333F44', '#37AA9C']) # '#94F3E4'
                        )

gender_layout = dict(title=dict(text='Prizes by Gender')
                  )

gender = go.Figure(data=[gender_data], layout=gender_layout)

gender.show(renderer='browser')

<hr>
<a class="anchor" id="category">
    
# Category 
    
</a>

In [10]:
category_labels = df['category'].value_counts()
category_values = (category_labels / category_labels.sum()) * 100
unique_category = df['category'].unique()

bar_category_data = dict(type='bar',
                        x=unique_category,
                        y=category_values,
                        marker=dict(color=['#333F44', '#4D5A64', '#68757E', '#828F99', '#9DA9B3', '#B7C4CE', '#D2DEE8', '#A9EDE2', '#93d9f2', '#94F3E4']))

bar_category_layout = dict(title=dict(text='Prizes by Category'), xaxis=dict(title='category'), yaxis=dict(title='Percentage'),
                           paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)'
                  )

bar_fig = go.Figure(data=[bar_category_data], layout=bar_category_layout)

bar_fig.show(renderer='browser')

<hr>
<a class="anchor" id="agebygender">
    
# Age by Gender
    
</a>

In [90]:
# Filter data by gender
male_data = df.loc[df['gender'] == 'male', 'prizeAge']
female_data = df.loc[df['gender'] == 'female', 'prizeAge']

# Create traces for each group
male_hist = go.Histogram(x=male_data, name='Male', marker=dict(color='#333F44'))
female_hist = go.Histogram(x=female_data, name='Female', marker=dict(color='#37AA9C'))

# Create layout
layout = go.Layout(title=dict(text='Ages Distribution by Gender'), 
                  paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)')

# Create figure and plot
fig_1 = go.Figure(data=[male_hist, female_hist], layout=layout)
fig_1.show(renderer='browser')

In [93]:
fig = px.histogram(df[df['gender'] != 'org'], x = 'prizeAge', color = 'gender', 
                  marginal = 'box', # or violin, rug, box
                  hover_data = df.columns,
                  color_discrete_sequence=['#e4a76c', '#877769'])

fig.update_layout(plot_bgcolor='rgba(0,0,0,0)') 

fig.show()

In [13]:
fig = go.Figure()
fig.add_trace(go.Histogram(name = 'Male', x=male_data, marker_color = '#333F44'))
fig.add_trace(go.Histogram(name = 'Female', x=female_data, marker_color = '#37AA9C'))

# Overlay both histograms
fig.update_layout(barmode='overlay', plot_bgcolor='rgba(0,0,0,0)')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show(renderer='browser')

In [14]:
# Filter data by gender
male_data = df.loc[df['gender'] == 'male', 'prizeAge']
female_data = df.loc[df['gender'] == 'female', 'prizeAge']

# Group data together
hist_data = [male_data, female_data]

group_labels = ['Male', 'Female']

colors = ['#333F44', '#37AA9C']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, show_hist = False, colors = colors)
# Add title
fig.update_layout(title_text = 'Ages Distribution by Gender',  
                  paper_bgcolor='rgba(0,0,0,0)', 
                  plot_bgcolor='rgba(0,0,0,0)')
fig.show(renderer='browser')

<hr>
<a class="anchor" id="categorybygender">
    
# Category by gender
    
</a>

In [15]:
# Filter data by gender
male_data = df.loc[df['gender'] == 'male', :]
female_data = df.loc[df['gender'] == 'female', :]

# Compute category values for each group
male_category_labels = male_data['category'].value_counts()
female_category_labels = female_data['category'].value_counts()

# Create traces for each group
male_bar = go.Bar(x=male_category_labels.index, y=male_category_labels.values, name='Male', marker=dict(color='#333F44'))
female_bar = go.Bar(x=female_category_labels.index, y=female_category_labels.values, name='Female', marker=dict(color='#37AA9C'))

# Create layout
layout = go.Layout(title=dict(text='Prizes by Category'), 
                   xaxis=dict(title='category'), 
                   yaxis=dict(title='Count'),  
                   paper_bgcolor='rgba(0,0,0,0)', 
                   plot_bgcolor='rgba(0,0,0,0)')

# Create figure and plot
fig = go.Figure(data=[male_bar, female_bar], layout=layout)
fig.show(renderer='browser')

In [106]:
fig = px.histogram(df, x='prizeAge', color='gender',
                   marginal='box', # or violin, rug, box
                   hover_data=df.columns,
                   color_discrete_sequence=['#333F44', '#37AA9C'],
                   facet_col='category')  # add facet_col argument

fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')

fig.show(renderer='browser')

In [107]:
#fig = px.sunburst(df, path=['category', 'gender'], values='count',
#                  color='category', hover_data=['gender'],
#                  color_continuous_scale='RdBu',
#                  color_continuous_midpoint=(df['gender'].value_counts()/df['gender'].value_counts().sum()) * 100)
#fig.show()

In [110]:
levels = ['gender', 'category'] # levels used for the hierarchical chart
color_columns = 'count'
value_column = 'count'

def build_hierarchical_dataframe(df, levels, value_column, color_columns=None):
    """
    Build a hierarchy of levels for Sunburst or Treemap charts.

    Levels are given starting from the bottom to the top of the hierarchy,
    ie the last level corresponds to the root.
    """
    df_all_trees = pd.DataFrame(columns=['id', 'parent', 'value', 'color'])
    for i, level in enumerate(levels):
        df_tree = pd.DataFrame(columns=['id', 'parent', 'value', 'color'])
        dfg = df.groupby(levels[i:]).sum()
        dfg = dfg.reset_index()
        df_tree['id'] = dfg[level].copy()
        if i < len(levels) - 1:
            df_tree['parent'] = dfg[levels[i+1]].copy()
        else:
            df_tree['parent'] = 'total'
        df_tree['value'] = dfg[value_column]
        df_tree['color'] = dfg[color_columns]
        df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
    total = pd.Series(dict(id='total', parent='',
                              value=df[value_column].sum(),
                              color=df[color_columns].sum()))
    df_all_trees = df_all_trees.append(total, ignore_index=True)
    return df_all_trees

df_all_trees = build_hierarchical_dataframe(df, levels, value_column, color_columns)
df_all_trees


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,id,parent,value,color
0,female,chemistry,8,8
1,female,economics,2,2
2,female,literature,17,17
3,female,medicine,12,12
4,female,peace,18,18
5,female,physics,4,4
6,male,chemistry,183,183
7,male,economics,90,90
8,male,literature,102,102
9,male,medicine,213,213


In [127]:
fig = go.Figure(go.Sunburst(
    labels=df_all_trees['id'],
    parents=df_all_trees['parent'],
    values=df_all_trees['value'],
    branchvalues='total',
    marker=dict(
        colors=df_all_trees['color'],
        colorscale= 'icefire', # 'icefire', 'oxy', 'tealrose', 'portland'
        cmid=1),
    hovertemplate='<b>%{label} </b> <br> Laureates: %{value}',
    maxdepth=2
    ))
fig.show()

More color scale

In [126]:
from textwrap import wrap
named_colorscales = px.colors.named_colorscales()
print("\n".join(wrap("".join('{:<12}'.format(c) for c in named_colorscales), 96)))

aggrnyl     agsunset    blackbody   bluered     blues       blugrn      bluyl       brwnyl
bugn        bupu        burg        burgyl      cividis     darkmint    electric    emrld
gnbu        greens      greys       hot         inferno     jet         magenta     magma
mint        orrd        oranges     oryel       peach       pinkyl      plasma      plotly3
pubu        pubugn      purd        purp        purples     purpor      rainbow     rdbu
rdpu        redor       reds        sunset      sunsetdark  teal        tealgrn     turbo
viridis     ylgn        ylgnbu      ylorbr      ylorrd      algae       amp         deep
dense       gray        haline      ice         matter      solar       speed       tempo
thermal     turbid      armyrose    brbg        earth       fall        geyser      prgn
piyg        picnic      portland    puor        rdgy        rdylbu      rdylgn      spectral
tealrose    temps       tropic      balance     curl        delta       oxy         edge
hsv     

<hr>
<a class="anchor" id="genderbyyear">
    
# Gender by Year
    
</a>

Create a pivot table to have the number of awards by gender and by year

In [77]:
pivot_table = pd.pivot_table(df, values='count', index='year', columns='gender', aggfunc='count', fill_value = 0)
pivot_table

gender,female,male,org
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1901,0,6,0
1902,0,7,0
1903,1,6,0
1904,0,5,1
1905,1,4,0
...,...,...,...
2018,4,9,0
2019,1,13,0
2020,4,7,1
2021,1,12,0


And set the pivot table a a dataframe

In [78]:
df_gender_year = pivot_table.reset_index()
df_gender_year.head()

gender,year,female,male,org
0,1901,0,6,0
1,1902,0,7,0
2,1903,1,6,0
3,1904,0,5,1
4,1905,1,4,0


In [79]:
df_gender_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   year    119 non-null    int64
 1   female  119 non-null    int64
 2   male    119 non-null    int64
 3   org     119 non-null    int64
dtypes: int64(4)
memory usage: 3.8 KB


In [80]:
# Set the columns we want to our plot
year = df_gender_year['year']
female = df_gender_year['female']
male = df_gender_year['male']*(-1)

# Creating instance of the figure
fig = go.Figure()
  
# Adding Female data to the figure
fig.add_trace(go.Bar(y = year, 
                     x = female,
                     name = 'Female', 
                     orientation = 'h',
                     marker=dict(color='#877769') 
                     ))
  
# Adding Male data to the figure
fig.add_trace(go.Bar(y= year, 
                     x = male, 
                     name = 'Male', 
                     orientation = 'h',
                     marker=dict(color='#e4a76c')))
  
# Updating the layout for our graph
fig.update_layout(title = 'Gender by Year',
                 title_font_size = 22, barmode = 'overlay',
                 bargap = 0.0, bargroupgap = 0,
                 xaxis = dict(tickvals = [-14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1,
                                          0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
                                
                              ticktext = [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
                                          0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
                                
                              title = 'Gender by Year',
                              title_font_size = 14)
                 )

# Make a horizontal highlight section
fig.add_hrect(y0=1939, y1=1945, 
              annotation_text="II World War", annotation_position='right',  
              annotation_font_size=12,
              annotation_font_color="Black",
              fillcolor="Grey", opacity=0.25)

fig.show()

<hr>
<a class="anchor" id="categorybyyear">
    
# Category by Year
    
</a>

In [160]:
fig = px.scatter(df, x="year", y="category", color = 'category')
fig.update_traces(marker_size=4)
fig.show()

<hr>
<a class="anchor" id="topuniversity">
    
# Top University
    
</a>

In [161]:
levels = ['category', 'name'] # levels used for the hierarchical chart
color_columns = 'count'
value_column = 'count'

def build_hierarchical_dataframe(df, levels, value_column, color_columns=None):
    """
    Build a hierarchy of levels for Sunburst or Treemap charts.

    Levels are given starting from the bottom to the top of the hierarchy,
    ie the last level corresponds to the root.
    """
    df_all_trees = pd.DataFrame(columns=['id', 'parent', 'value', 'color'])
    for i, level in enumerate(levels):
        df_tree = pd.DataFrame(columns=['id', 'parent', 'value', 'color'])
        dfg = df.groupby(levels[i:]).sum()
        dfg = dfg.reset_index()
        df_tree['id'] = dfg[level].copy()
        if i < len(levels) - 1:
            df_tree['parent'] = dfg[levels[i+1]].copy()
        else:
            df_tree['parent'] = 'total'
        df_tree['value'] = dfg[value_column]
        df_tree['color'] = dfg[color_columns]
        df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
    total = pd.Series(dict(id='total', parent='',
                              value=df[value_column].sum(),
                              color=df[color_columns].sum()))
    df_all_trees = df_all_trees.append(total, ignore_index=True)
    return df_all_trees

df_all_trees = build_hierarchical_dataframe(df, levels, value_column, color_columns)
df_all_trees


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,id,parent,value,color
0,chemistry,Aarhus University,1,1
1,chemistry,Asahi Kasei Corporation,1,1
2,chemistry,Berlin University,4,4
3,chemistry,Binghamton University State University of New ...,1,1
4,chemistry,Birmingham University,1,1
...,...,...,...,...
723,École Normale Supérieure,total,1,1
724,École Polytechnique,total,2,2
725,École Supérieure de Physique et Chimie,total,1,1
726,École municipale de physique et de chimie indu...,total,1,1


In [162]:
fig = go.Figure(go.Sunburst(
    labels=df_all_trees['id'],
    parents=df_all_trees['parent'],
    values=df_all_trees['value'],
    branchvalues='total',
    marker=dict(
        colors=df_all_trees['color'],
        colorscale= 'icefire', # 'icefire', 'oxy', 'tealrose', 'portland'
        cmid=1),
    hovertemplate='<b>%{label} </b> <br> Laureates: %{value}',
    maxdepth=2
    ))
fig.show()

In [168]:
# Checking if there are duplicates IDs
# Identify the duplicated rows in the 'id' column
duplicates = df[df['id'].duplicated(keep=False)]

# Group the duplicated rows by their 'id' values and count the number of occurrences
duplicates_summary = duplicates.groupby('id').size().reset_index(name='count')

In [173]:
# Print the summary
id_list = duplicates_summary['id'].tolist()

In [176]:
# Filter the DataFrame based on the id list
filtered_df = df[df['id'].isin(id_list)]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,id,firstname,surname,born,died,bornCountry,bornCountryCode,bornCity,diedCountry,diedCountryCode,...,category,overallMotivation,share,motivation,name,city,country,prizeAge,count,total
5,6,Marie,Curie,1867,1934,Russian Empire (now Poland),PL,Warsaw,France,FR,...,physics,,4,"""in recognition of the extraordinary services ...",,,,36,1,989
6,6,Marie,Curie,1867,1934,Russian Empire (now Poland),PL,Warsaw,France,FR,...,chemistry,,1,"""in recognition of her services to the advance...",Sorbonne University,Paris,France,44,1,989
65,66,John,Bardeen,1908,1991,USA,US,Madison WI,USA,US,...,physics,,3,"""for their researches on semiconductors and th...",University of Illinois,Urbana IL,USA,48,1,989
66,66,John,Bardeen,1908,1991,USA,US,Madison WI,USA,US,...,physics,,3,"""for their jointly developed theory of superco...",University of Illinois,Urbana IL,USA,64,1,989
214,217,Linus,Pauling,1901,1994,USA,US,Portland OR,USA,US,...,peace,,1,"""for his fight against the nuclear arms race b...",California Institute of Technology (Caltech),Pasadena CA,USA,61,1,989
215,217,Linus,Pauling,1901,1994,USA,US,Portland OR,USA,US,...,chemistry,,1,"""for his research into the nature of the chemi...",California Institute of Technology (Caltech),Pasadena CA,USA,53,1,989
220,222,Frederick,Sanger,1918,2013,United Kingdom,GB,Rendcombe,United Kingdom,GB,...,chemistry,,1,"""for his work on the structure of proteins esp...",University of Cambridge,Cambridge,United Kingdom,40,1,989
221,222,Frederick,Sanger,1918,2013,United Kingdom,GB,Rendcombe,United Kingdom,GB,...,chemistry,,4,"""for their contributions concerning the determ...",MRC Laboratory of Molecular Biology,Cambridge,United Kingdom,62,1,989
479,482,International Committee of the Red Cross,,1863,0,,,,,,...,peace,,1,"""for the efforts to take care of wounded soldi...",,,,54,1,989
480,482,International Committee of the Red Cross,,1863,0,,,,,,...,peace,,1,"""for the great work it has performed during th...",,,,81,1,989
