In [None]:
# on va créer un nouveau ratio + efficace


performance = total_medailles * [1 - ratio_specialisation]

In [None]:
# indice de Shannon 
# indice de Simpson

In [1]:
import pandas as pd
import numpy as np   # <— NÉCESSAIRE pour np.log

# 1. Charger les données
athletes = pd.read_csv('athlete_events.csv')
noc = pd.read_csv('noc_regions.csv')

# 2. Fusionner et ne garder que les lignes avec médailles
data = pd.merge(athletes, noc, on='NOC', how='left')
data_medals = data.dropna(subset=['Medal'])

# 3. Nombre de médailles par pays et sport
grouped = (
    data_medals
    .groupby(['NOC','Sport'])
    .size()
    .reset_index(name='podium_count')
)

# 4. Total de médailles par pays
totals = grouped.groupby('NOC')['podium_count']\
                .sum()\
                .rename('total_medals')

# 5. Calculer la proportion p_i de chaque sport i
df = grouped.join(totals, on='NOC')
df['p'] = df['podium_count'] / df['total_medals']

# 6. Calcul de l’entropie (indice de Shannon) par pays
shannon = (
    df
    .assign(entropy = lambda d: -d['p'] * np.log(d['p']))
    .groupby('NOC')['entropy']
    .sum()
    .rename('H_shannon')
)

# 7. Normalisation (optionnelle) : H_norm = H / ln(S), avec S = nombre de sports différents
sports_count = grouped.groupby('NOC')['Sport']\
                      .nunique()\
                      .rename('S')
shannon_norm = (
    pd.concat([shannon, sports_count], axis=1)
      .assign(H_norm = lambda d: d['H_shannon'] / np.log(d['S']))
      .reset_index()[['NOC','H_norm']]
)

# 8. Merge dans ton DataFrame final (avec region)
specialisation = (
    pd.merge(
      grouped.groupby('NOC')['podium_count'].max().rename('max_podium'),
      totals.reset_index(),
      on='NOC'
    )
    .merge(shannon_norm, on='NOC', how='left')
    .merge(noc[['NOC','region']], on='NOC', how='left')
)

# 9. Renommer et réorganiser
specialisation['ratio_specialisation'] = specialisation['max_podium'] / specialisation['total_medals']
specialisation = specialisation[[
    'region','max_podium','total_medals','ratio_specialisation','H_norm'
]]
specialisation.rename(columns={'region':'Pays'}, inplace=True)

# 10. Afficher un aperçu
print(specialisation.head())


          Pays  max_podium  total_medals  ratio_specialisation    H_norm
0  Afghanistan           2             2              1.000000       NaN
1      Curacao           1             1              1.000000       NaN
2      Algeria           9            17              0.529412  0.870229
3    Australia          15            29              0.517241  0.656772
4    Argentina          81           274              0.295620  0.733777


In [7]:
pd.reset_option("display.max_rows")

In [12]:
pd.set_option("display.max_rows", None) 

In [8]:
specialisation

Unnamed: 0,Pays,max_podium,total_medals,ratio_specialisation,H_norm
0,Afghanistan,2,2,1.000000,
1,Curacao,1,1,1.000000,
2,Algeria,9,17,0.529412,0.870229
3,Australia,15,29,0.517241,0.656772
4,Argentina,81,274,0.295620,0.733777
...,...,...,...,...,...
144,Vietnam,2,4,0.500000,0.946395
145,Trinidad,5,5,1.000000,
146,Serbia,84,390,0.215385,0.751113
147,Zambia,1,2,0.500000,1.000000


In [13]:
a = specialisation.sort_values(
    by=['ratio_specialisation'],
    ascending = False)
a

Unnamed: 0,Pays,max_podium,total_medals,ratio_specialisation,H_norm
0,Afghanistan,2,2,1.0,
1,Curacao,1,1,1.0,
16,Botswana,1,1,1.0,
10,Barbados,1,1,1.0,
11,Burundi,2,2,1.0,
18,Bahrain,3,3,1.0,
13,Bermuda,1,1,1.0,
60,Individual Olympic Athletes,5,5,1.0,
63,Iraq,1,1,1.0,
66,"Virgin Islands, US",1,1,1.0,
