# Day 79
NOBEL DATA

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

# Render floats with thousands separators and two decimals in DataFrame output.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Raw Nobel laureate records; later cells add derived columns to this frame.
initial_data = pd.read_csv('nobel_prize_data.csv')

In [None]:
# Exploratory checks on the frame, kept for reference (uncomment one to inspect):
#   initial_data.shape
#   initial_data.columns
#   initial_data.info()
#   initial_data.describe()
#   initial_data.sample()
#   initial_data.duplicated().sum()
#   initial_data.isna().sum()

# Convert birth dates from strings to datetime values.
initial_data['birth_date'] = pd.to_datetime(initial_data['birth_date'])

# prize_share is text split on '/'; the part before the slash divided by the
# part after it becomes the new numeric share_pct column.
share_parts = initial_data['prize_share'].str.split('/', expand=True)
share_numerator = pd.to_numeric(share_parts[0])
share_denominator = pd.to_numeric(share_parts[1])
initial_data['share_pct'] = share_numerator / share_denominator

In [None]:
# Prizes split between men and women, drawn as a donut chart.
gen = initial_data.sex.value_counts()

# `names` supplies the sector labels and `values` the sector sizes.
# `labels` is intentionally not passed: in Plotly Express it is a dict that
# renames columns for display, not a list of sector labels.
pizza = px.pie(names=gen.index,
               values=gen.values,
               title="Male vs. Female Nobel Winners",
               hole=0.4)  # a non-zero hole turns the pie into a donut

# Show only the percentage, placed inside each sector.
pizza.update_traces(textposition='inside', textfont_size=15, textinfo='percent')

pizza.show()

In [None]:
# First women to win a Nobel prize: the ten earliest rows by award year.
female_laureates = initial_data.loc[initial_data['sex'] == 'Female']
female_laureates.sort_values(by='year').head(10)

In [None]:
# Laureates who won more than once.
# keep=False flags every row of a repeated full_name, not only the later repeats.
is_winner = initial_data.duplicated(subset=['full_name'], keep=False)
multiple_winners = initial_data[is_winner]

repeat_winner_count = multiple_winners['full_name'].nunique()
print(f'There are {repeat_winner_count} winners who won more than once.')

# Show the identifying columns, grouped visually by name.
display_columns = ['year', 'category', 'laureate_type', 'full_name']
multiple_winners[display_columns].sort_values('full_name')

In [None]:
# Number of prizes per category (science, chemistry, etc.).
prize_cat = initial_data['category'].value_counts()

vertical_bar_graph = px.bar(x=prize_cat.index,
                            y=prize_cat.values,
                            color=prize_cat.values,  # shade each bar by its own count
                            color_continuous_scale='Aggrnyl',
                            title='Prizes Awarded per Category')

# Name the axes and hide the colour bar.
vertical_bar_graph.update_layout(
    xaxis_title='Prize Category',
    yaxis_title='Quantity',
    coloraxis_showscale=False,
)
vertical_bar_graph.show()

In [None]:
# Prizes per category, split between men and women, largest counts first.
categ_gen = (
    initial_data
    .groupby(['category', 'sex'], as_index=False)
    .agg({'prize': pd.Series.count})
    .sort_values('prize', ascending=False)
)

# One bar per category, coloured by sex.
vertical_graph = px.bar(x=categ_gen.category,
                        y=categ_gen.prize,
                        color=categ_gen.sex,
                        title='Category split by Men and Women')

vertical_graph.update_layout(
    xaxis_title='Prize Category',
    yaxis_title='Quantity',
)
vertical_graph.show()

In [None]:
# Prizes awarded per year, plus a 5-year rolling mean of that count.
prize_year = initial_data.groupby('year').count()['prize']
moving_avg = prize_year.rolling(window=5).mean()

# Array of years in steps of five, used as the x tick positions.
interv = np.arange(1900, 2021, step=5)

plt.figure(figsize=(16, 8), dpi=200)
plt.title('Prizes Awarded per Year', fontsize=15)
plt.yticks(fontsize=14)
plt.xticks(ticks=interv, fontsize=14, rotation=45)

axis = plt.gca()  # current axes
axis.set_xlim(1900, 2020)

# Yearly prize counts as scatter points.
axis.scatter(x=prize_year.index,
             y=prize_year.values,
             c='dodgerblue',
             alpha=0.7,
             s=100)

# Rolling mean as a line over the points.
axis.plot(prize_year.index,
          moving_avg.values,
          c='crimson',
          linewidth=3)

plt.show()

In [None]:
# Average prize share per year, compared with the prize counts from the previous chart.
yearly_avg_share = initial_data.groupby('year').agg({'share_pct': pd.Series.mean})
share_mvg_avg = yearly_avg_share.rolling(window=5).mean()  # 5-year rolling mean

plt.figure(figsize=(16, 8), dpi=200)
plt.title('Prizes Awarded per Year', fontsize=15)
plt.yticks(fontsize=14)
plt.xticks(ticks=interv, fontsize=14, rotation=45)

axis1 = plt.gca()
axis2 = axis1.twinx()  # second y axis sharing the same x axis
axis1.set_xlim(1900, 2020)
axis2.invert_yaxis()  # the share axis is drawn upside down

# Left axis: yearly prize counts as scatter points.
axis1.scatter(x=prize_year.index,
              y=prize_year.values,
              c='skyblue',
              alpha=0.7,
              s=100)

# Left axis: rolling mean of the prize counts.
axis1.plot(prize_year.index,
           moving_avg.values,
           c='red',
           linewidth=3)

# Right (inverted) axis: rolling mean of the average share.
axis2.plot(prize_year.index,
           share_mvg_avg.values,
           c='orange',
           linewidth=3)

plt.show()

In [None]:
# Countries with the most prizes, keyed by the laureate's birth country (current name).
top_countries = (
    initial_data
    .groupby(['birth_country_current'], as_index=False)
    .agg({'prize': pd.Series.count})
    .sort_values(by='prize')
)

# Sorted ascending, so the last 15 rows are the 15 largest counts.
top15_countries = top_countries.tail(15)

horizont_bar_graph = px.bar(x=top15_countries.prize,
                            y=top15_countries.birth_country_current,
                            orientation='h',
                            color=top15_countries.prize,
                            color_continuous_scale='Viridis',
                            title='Top 15 Winner Countries')

horizont_bar_graph.update_layout(
    xaxis_title='Number of Prizes',
    yaxis_title='Country',
    coloraxis_showscale=False,
)
horizont_bar_graph.show()

In [None]:
# Prizes per birth country, shown on a world map.
df_countries = initial_data.groupby(['birth_country_current', 'ISO'],
                                    as_index=False).agg({'prize': pd.Series.count})
# sort_values returns a new frame, so assign it back; otherwise the
# ordering by prize count is silently thrown away.
df_countries = df_countries.sort_values('prize', ascending=False)

world_map = px.choropleth(df_countries,
                          locations='ISO',  # column used to place each row on the map
                          color='prize',
                          hover_name='birth_country_current',
                          color_continuous_scale=px.colors.sequential.matter)

# Keep the colour bar visible so the shading can be read as a prize count.
world_map.update_layout(coloraxis_showscale=True,)

world_map.show()

In [None]:
# Prizes per (country, category) pair, largest first.
country_ctg = (
    initial_data
    .groupby(['birth_country_current', 'category'], as_index=False)
    .agg({'prize': pd.Series.count})
    .sort_values(by='prize', ascending=False)
)

# Merge with the top-15 table: only those countries remain, and each row also
# carries its country's total. Both inputs have a `prize` column, so the merge
# produces prize_x (per category) and prize_y (country total).
merged_df = (
    pd.merge(country_ctg, top15_countries, on='birth_country_current')
    .rename(columns={'prize_x': 'cat_prize', 'prize_y': 'total_prize'})
    .sort_values(by='total_prize')
)

# Horizontal bars per country, stacked by category.
cat_cntry_bar = px.bar(x=merged_df.cat_prize,
                       y=merged_df.birth_country_current,
                       color=merged_df.category,
                       orientation='h',
                       title='Top 15 Countries by Number of Prizes and Category')

cat_cntry_bar.update_layout(
    xaxis_title='Quantity',
    yaxis_title='Country',
)
cat_cntry_bar.show()

In [None]:
# Line chart of prizes won over time per country (useful for seeing the time
# span and the growth by decade).
prize_by_year = (
    initial_data
    .groupby(by=['birth_country_current', 'year'], as_index=False)
    .count()
    .sort_values('year')
    [['year', 'birth_country_current', 'prize']]
)

# Total per (country, year), then a running sum within each country.
yearly_totals = prize_by_year.groupby(by=['birth_country_current', 'year']).sum()
cumulative_prizes = yearly_totals.groupby(level=[0]).cumsum().reset_index()

line_chart = px.line(cumulative_prizes,
                     x='year',
                     y='prize',
                     color='birth_country_current',
                     hover_name='birth_country_current')

line_chart.update_layout(
    xaxis_title='Year',
    yaxis_title='Number of Prizes',
)

line_chart.show()

In [None]:
# Top 15 organizations by number of prizes.
# value_counts is largest-first; the final ascending sort reverses the 15 rows.
top15_orgs = (
    initial_data['organization_name']
    .value_counts()
    .head(15)
    .sort_values(ascending=True)
)

org_bar = px.bar(x=top15_orgs.values,
                 y=top15_orgs.index,
                 orientation='h',
                 color=top15_orgs.values,
                 color_continuous_scale=px.colors.sequential.haline,
                 title='Top 15 Research Organizations by Prizes')

org_bar.update_layout(
    xaxis_title='Quantity',
    yaxis_title='Institution',
    coloraxis_showscale=False,
)
org_bar.show()

In [None]:
# Top 15 cities by number of prizes, using the city of the laureate's organization.
top15_org_cities = (
    initial_data['organization_city']
    .value_counts()
    .head(15)
    .sort_values(ascending=True)
)

cities_h_graph = px.bar(x=top15_org_cities.values,
                        y=top15_org_cities.index,
                        orientation='h',
                        color=top15_org_cities.values,
                        color_continuous_scale=px.colors.sequential.Plasma,
                        title='Most Winner Cities')

cities_h_graph.update_layout(
    xaxis_title='Quantity',
    yaxis_title='City',
    coloraxis_showscale=False,
)
cities_h_graph.show()

In [None]:
# Top 15 birth cities of the winners.
top15_born_cities = (
    initial_data['birth_city']
    .value_counts()
    .head(15)
    .sort_values(ascending=True)
)

city_bar = px.bar(x=top15_born_cities.values,
                  y=top15_born_cities.index,
                  orientation='h',
                  color=top15_born_cities.values,
                  color_continuous_scale=px.colors.sequential.Plasma,
                  title='Nobel Winner Born Cities')

city_bar.update_layout(
    xaxis_title='Quantity',
    yaxis_title='City of Birth',
    coloraxis_showscale=False,
)
city_bar.show()

In [None]:
# Sunburst of where the winning organizations are:
# country -> city -> organization, sized by prize count.
country_city_org = initial_data.groupby(
    by=['organization_country', 'organization_city', 'organization_name'],
    as_index=False).agg({'prize': pd.Series.count})

country_city_org = country_city_org.sort_values('prize', ascending=False)

burst = px.sunburst(country_city_org,
                    path=['organization_country', 'organization_city', 'organization_name'],
                    values='prize',
                    title='Discoveries and Winner Geography',
                    )

# Axis titles and colour-bar options are deliberately not set: a sunburst has
# no x/y axes and this figure maps no continuous colour.
burst.show()

In [None]:
# Age of each laureate at the time of the award, stored on the frame.
# It is award year minus birth year, so month and day are ignored.
birth_year = initial_data.birth_date.dt.year
initial_data['winning_age'] = initial_data.year - birth_year

# Print explicitly: a notebook only echoes the last expression of a cell, so a
# bare describe() in the middle of the cell would produce no output.
print(initial_data.winning_age.describe())

# Histogram of ages at award.
plt.figure(figsize=(8, 4), dpi=200)
sns.histplot(data=initial_data,
             x='winning_age',
             bins=30)
plt.xlabel('Idade')
plt.ylabel('Quantidade')
plt.title('Distribuição da idade de ganhadores')
plt.show()

In [None]:
# Scatter of winner ages over the years, with a LOWESS trend line.
plt.figure(figsize=(8, 4), dpi=200)

point_style = {'alpha': 0.4}
trend_style = {'color': 'red'}

with sns.axes_style("whitegrid"):
    sns.regplot(data=initial_data,
                x='year',
                y='winning_age',
                lowess=True,
                scatter_kws=point_style,
                line_kws=trend_style)

plt.show()

In [None]:
# Age differences by category, as box plots.
# The figure and axes are created inside the style context: axes take their
# style when they are created, so calling plt.title() beforehand (which creates
# the axes) would leave them with the default look instead of "whitegrid".
with sns.axes_style("whitegrid"):
    plt.figure(figsize=(8, 4), dpi=200)
    box_ax = sns.boxplot(data=initial_data,
                         x='category',
                         y='winning_age')
    # Label after plotting so these are not replaced by the column-name labels
    # seaborn may apply while drawing.
    box_ax.set_title('Age Spread by Category')
    box_ax.set_ylabel('Age')
    box_ax.set_xlabel('Category')

plt.show()

In [None]:
# Age at award over the years for each category, with one LOWESS trend line
# per category drawn on shared axes through the hue argument.
# (Passing row='category' instead of hue would give one panel per category.)
point_style = {'alpha': 0.5}
trend_style = {'linewidth': 5}

with sns.axes_style("whitegrid"):
    sns.lmplot(data=initial_data,
               x='year',
               y='winning_age',
               hue='category',
               lowess=True,
               aspect=2,
               scatter_kws=point_style,
               line_kws=trend_style)

plt.show()