**Authors**: <br>
    Iryna Savchuk | 20211310 <br>
    Cátia Parrinha | 20201320 <br>
    Gueu...

**Table of Contents** <br>
* [Import Libraries](#importlibraries)
* [Age](#age)
* [Gender](#gender)
* [Category](#category)
* [Age by Gender](#agebygender)
* [Category by Gender](#categorybygender)
* [Gender by Year](#genderbyyear)

<hr>
<a class="anchor" id="importlibraries">
    
# Import libraries
    
</a>

In [112]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff
import chart_studio.plotly as py

import geojson
import country_converter as coco # to convert and match country names

In [113]:
# Load the merged dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='data/merged.csv')

<hr>
<a class="anchor" id="age">
    
# Age
    
</a>

In [7]:
# Histogram of the 'prizeAge' column over the whole dataset.
x = df['prizeAge']

# Trace and layout are described as plain dicts and handed to go.Figure.
hist_data = {
    'type': 'histogram',
    'x': x,
    'marker': {'color': 'silver'},
}
layout = {'title': {'text': 'Ages Distribution'}}

fig_1 = go.Figure(data=hist_data, layout=layout)
fig_1.show(renderer='browser')

<hr>
<a class="anchor" id="gender">
    
# Gender
    
</a>

In [8]:
# List the distinct values found in the 'gender' column.
df.loc[:, 'gender'].unique()

array(['male', 'female', 'org'], dtype=object)

In [9]:
# Share of prizes per gender. value_counts() returns a Series indexed by the
# gender value, so its index and its values are aligned with each other.
gender_labels = df['gender'].value_counts()
gender_values = (gender_labels / gender_labels.sum()) * 100

# Take the labels from the same Series as the values. df['gender'].unique()
# is ordered by first appearance (and keeps NaN), which is not guaranteed to
# match the count-sorted order of value_counts() and could mislabel slices.
unique_genders = gender_labels.index.to_numpy()

# NOTE(review): only two colours are supplied; if there are more than two
# gender values the extra slices presumably fall back to Plotly's default
# colours. '#94F3E4' was left as a candidate third colour.
gender_data = dict(type='pie',
                   labels=unique_genders,
                   values=gender_values,
                   textposition='inside',
                   hole=0.6,  # hole > 0 turns the pie into a donut
                   marker=dict(colors=['#333F44', '#37AA9C'])  # '#94F3E4'
                   )

gender_layout = dict(title=dict(text='Prizes by Gender'))

gender = go.Figure(data=[gender_data], layout=gender_layout)

gender.show(renderer='browser')

<hr>
<a class="anchor" id="category">
    
# Category 
    
</a>

In [10]:
# Share of prizes per category. value_counts() returns a Series indexed by
# category, so its index and its values are aligned with each other.
category_labels = df['category'].value_counts()
category_values = (category_labels / category_labels.sum()) * 100

# Take the x labels from the same Series as the y values. The previous
# df['category'].unique() is ordered by first appearance, while
# value_counts() is sorted by count, so bars could be paired with the wrong
# category name.
unique_category = category_labels.index.to_numpy()

# One colour per bar; colours beyond the number of bars are simply unused.
bar_colors = ['#333F44', '#4D5A64', '#68757E', '#828F99', '#9DA9B3',
              '#B7C4CE', '#D2DEE8', '#A9EDE2', '#93d9f2', '#94F3E4']

bar_category_data = dict(type='bar',
                         x=unique_category,
                         y=category_values,
                         marker=dict(color=bar_colors))

# Transparent paper and plot backgrounds.
bar_category_layout = dict(title=dict(text='Prizes by Category'),
                           xaxis=dict(title='category'),
                           yaxis=dict(title='Percentage'),
                           paper_bgcolor='rgba(0,0,0,0)',
                           plot_bgcolor='rgba(0,0,0,0)')

bar_fig = go.Figure(data=[bar_category_data], layout=bar_category_layout)

bar_fig.show(renderer='browser')

<hr>
<a class="anchor" id="agebygender">
    
# Age by Gender
    
</a>

In [11]:
# 'prizeAge' values for the 'male' and 'female' rows only
male_data = df[df['gender'] == 'male']['prizeAge']
female_data = df[df['gender'] == 'female']['prizeAge']

# One histogram trace per gender
male_hist = go.Histogram(name='Male', x=male_data, marker_color='#333F44')
female_hist = go.Histogram(name='Female', x=female_data, marker_color='#37AA9C')

# Title plus transparent paper and plot backgrounds
layout = go.Layout(
    title={'text': 'Ages Distribution by Gender'},
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

# Assemble both traces into one figure and open it in the browser
fig_1 = go.Figure(data=[male_hist, female_hist], layout=layout)
fig_1.show(renderer='browser')

In [12]:
# Age histogram coloured by gender, with a marginal box plot.
fig = px.histogram(
    df,
    x='prizeAge',
    color='gender',
    marginal='box',  # other options: 'violin', 'rug'
    hover_data=df.columns,
    color_discrete_sequence=['#333F44', '#37AA9C'],
)

# Transparent plotting area
fig.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)'})

fig.show(renderer='browser')

In [13]:
# Both gender histograms in a single figure; male_data and female_data are
# the per-gender 'prizeAge' series computed in an earlier cell.
fig = go.Figure(data=[
    go.Histogram(name='Male', x=male_data, marker=dict(color='#333F44')),
    go.Histogram(name='Female', x=female_data, marker=dict(color='#37AA9C')),
])

# Draw the bars on top of each other instead of side by side
fig.update_layout(barmode='overlay', plot_bgcolor='rgba(0,0,0,0)')
# Semi-transparent bars so the histogram behind stays visible
fig.update_traces(opacity=0.75)
fig.show(renderer='browser')

In [14]:
# 'prizeAge' values for the 'male' and 'female' rows only
male_data = df[df['gender'] == 'male']['prizeAge']
female_data = df[df['gender'] == 'female']['prizeAge']

# Inputs for the distplot: one series, one label and one colour per group
hist_data = [male_data, female_data]
group_labels = ['Male', 'Female']
colors = ['#333F44', '#37AA9C']

# Distribution plot with the histogram bars switched off (show_hist=False)
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=colors,
    show_hist=False,
)

# Title and transparent backgrounds
fig.update_layout(
    title=dict(text='Ages Distribution by Gender'),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show(renderer='browser')

<hr>
<a class="anchor" id="categorybygender">
    
# Category by gender
    
</a>

In [15]:
# All columns for the 'male' and 'female' rows
male_data = df[df['gender'] == 'male']
female_data = df[df['gender'] == 'female']

# Number of rows per category within each gender
male_category_labels = male_data['category'].value_counts()
female_category_labels = female_data['category'].value_counts()

# One bar trace per gender; x and y come from the same value_counts() Series
male_bar = go.Bar(
    name='Male',
    x=male_category_labels.index,
    y=male_category_labels.values,
    marker_color='#333F44',
)
female_bar = go.Bar(
    name='Female',
    x=female_category_labels.index,
    y=female_category_labels.values,
    marker_color='#37AA9C',
)

# Titles and transparent backgrounds
layout = go.Layout(
    title={'text': 'Prizes by Category'},
    xaxis={'title': 'category'},
    yaxis={'title': 'Count'},
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

# Build the figure and open it in the browser
fig = go.Figure(data=[male_bar, female_bar], layout=layout)
fig.show(renderer='browser')

In [16]:
# Age histogram coloured by gender, one facet column per category,
# each with a marginal box plot.
fig = px.histogram(
    df,
    x='prizeAge',
    color='gender',
    facet_col='category',
    marginal='box',  # other options: 'violin', 'rug'
    hover_data=df.columns,
    color_discrete_sequence=['#333F44', '#37AA9C'],
)

# Transparent plotting area
fig.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)'})

fig.show(renderer='browser')

<hr>
<a class="anchor" id="genderbyyear">
    
# Gender by Year
    
</a>

Check the original dataframe

In [116]:
# Preview the first two rows
df.head(n=2)

Unnamed: 0,id,firstname,surname,born,died,bornCountry,bornCountryCode,bornCity,diedCountry,diedCountryCode,...,year,category,overallMotivation,share,motivation,name,city,country,prizeAge,count
0,1,Wilhelm Conrad,Röntgen,1845,1923,Prussia (now Germany),DE,Lennep (now Remscheid),Germany,DE,...,1901,physics,,1,"""in recognition of the extraordinary services ...",Munich University,Munich,Germany,56,1
1,2,Hendrik A.,Lorentz,1853,1928,the Netherlands,NL,Arnhem,the Netherlands,NL,...,1902,physics,,2,"""in recognition of the extraordinary service t...",Leiden University,Leiden,the Netherlands,49,1


Add a new column 'count' filled with 1, so that aggregating it gives the total number of awards

In [118]:
# Constant column of ones, used below as the value to aggregate
df['count'] = 1
# Preview the first two rows with the new column
df.head(n=2)

Unnamed: 0,id,firstname,surname,born,died,bornCountry,bornCountryCode,bornCity,diedCountry,diedCountryCode,...,year,category,overallMotivation,share,motivation,name,city,country,prizeAge,count
0,1,Wilhelm Conrad,Röntgen,1845,1923,Prussia (now Germany),DE,Lennep (now Remscheid),Germany,DE,...,1901,physics,,1,"""in recognition of the extraordinary services ...",Munich University,Munich,Germany,56,1
1,2,Hendrik A.,Lorentz,1853,1928,the Netherlands,NL,Arnhem,the Netherlands,NL,...,1902,physics,,2,"""in recognition of the extraordinary service t...",Leiden University,Leiden,the Netherlands,49,1


Create a pivot table to have the number of awards by gender and by year

In [120]:
# Rows: year, columns: gender, cells: number of 'count' entries.
# Year/gender combinations with no rows are filled with 0.
pivot_table = df.pivot_table(
    values='count',
    index='year',
    columns='gender',
    aggfunc='count',
    fill_value=0,
)
pivot_table

gender,female,male,org
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1901,0,6,0
1902,0,7,0
1903,1,6,0
1904,0,5,1
1905,1,4,0
...,...,...,...
2018,4,9,0
2019,1,13,0
2020,4,7,1
2021,1,12,0


Then convert the pivot table into a regular dataframe, with 'year' as an ordinary column

In [None]:
# Move 'year' out of the index into a regular column
df_gender_year = pivot_table.reset_index(drop=False)
df_gender_year.head(n=5)

In [123]:
# Print the column names, non-null counts and dtypes of the reshaped frame
df_gender_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   year    119 non-null    int64
 1   female  119 non-null    int64
 2   male    119 non-null    int64
 3   org     119 non-null    int64
dtypes: int64(4)
memory usage: 3.8 KB


In [218]:
# Columns to plot. Male counts are negated so their bars extend to the left
# of zero while female bars extend to the right (a diverging bar chart).
year = df_gender_year['year']
female = df_gender_year['female']
male = df_gender_year['male'] * (-1)

# Symmetric x-axis ticks around zero. The labels show absolute values so the
# negated male counts still read as positive numbers.
tick_limit = 14
tickvals = list(range(-tick_limit, tick_limit + 1))
ticktext = [abs(value) for value in tickvals]

# Creating instance of the figure
fig = go.Figure()

# Adding Female data to the figure (positive side of the x axis)
fig.add_trace(go.Bar(y=year,
                     x=female,
                     name='Female',
                     orientation='h',
                     marker=dict(color='red')))

# Adding Male data to the figure (negative side of the x axis)
fig.add_trace(go.Bar(y=year,
                     x=male,
                     name='Male',
                     orientation='h',
                     marker=dict(color='blue')))

# Updating the layout for our graph. The x axis carries the award counts,
# so it is titled accordingly instead of repeating the chart title.
fig.update_layout(title='Gender by Year',
                  title_font_size=22, barmode='overlay',
                  bargap=0.0, bargroupgap=0,
                  xaxis=dict(tickvals=tickvals,
                             ticktext=ticktext,
                             title='Number of awards',
                             title_font_size=14),
                  yaxis=dict(title='Year')
                  )

# Highlight the horizontal band between the years 1939 and 1945
fig.add_hrect(y0=1939, y1=1945,
              annotation_text="II World War", annotation_position='right',
              annotation_font_size=12,
              annotation_font_color="Black",
              fillcolor="Grey", opacity=0.25)

fig.show(renderer='browser')