In [507]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [508]:
# Source file: the athletes table of the TidyTuesday 2021-08-03 release.
ATHLETES_CSV_URL = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-08-03/athletes.csv"

# data_source keeps the table exactly as downloaded.
data_source = pd.read_csv(ATHLETES_CSV_URL)
# Work on an independent copy: a plain assignment would only create a second
# name for the same frame, so in-place edits would also alter data_source.
data_raw = data_source.copy()
data_raw.head()

Unnamed: 0,gender,event,medal,athlete,abb,country,grp_id,type,year,guide
0,Men,Double FITA Round Amputee,Gold,LARSEN Finn,DEN,,,Archery,1980,
1,Men,Double FITA Round Amputee,Silver,BRENNE Manfred,FRG,,,Archery,1980,
2,Men,Double FITA Round Amputee,Bronze,SATO Masao,JPN,,,Archery,1980,
3,Men,Double FITA Round Paraplegic,Gold,GEISS H.,FRG,,,Archery,1980,
4,Men,Double FITA Round Paraplegic,Silver,GRUN Guy,BEL,,,Archery,1980,


In [509]:
# Column dtypes and non-null counts, to spot the columns with missing values.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19547 entries, 0 to 19546
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   gender   19403 non-null  object 
 1   event    19547 non-null  object 
 2   medal    19547 non-null  object 
 3   athlete  19112 non-null  object 
 4   abb      19498 non-null  object 
 5   country  5119 non-null   object 
 6   grp_id   5119 non-null   float64
 7   type     19547 non-null  object 
 8   year     19547 non-null  int64  
 9   guide    53 non-null     object 
dtypes: float64(1), int64(1), object(8)
memory usage: 1.5+ MB


## Explications des variables/colonnes

|variable            |class     |description |
|:-------------------|:---------|:-----------|
|gender              |character | Gender of athlete|
|event               |character | Name of the event (e.g. "Double FITA Round Amputee")|
|medal               |character | Type of medal  |
|athlete             |character | Name of athlete |
|abb                 |character | Abbreviation of the country of athlete |
|country             |character | The country of athlete |
|grp_id              |double    | |
|type                |character | Sport name |
|year                |integer   | Year |
|guide               |character | |

In [510]:
# Summary statistics for every column, non-numeric ones included.
data_raw.describe(include='all')

Unnamed: 0,gender,event,medal,athlete,abb,country,grp_id,type,year,guide
count,19403,19547,19547,19112,19498,5119,5119.0,19547,19547.0,53
unique,3,1670,3,6779,117,137,,11,,42
top,Men,Tournament,Gold,ZORN Trischa,USA,United States of America,,Athletics,,XU Donglin
freq,11982,450,6611,44,1901,504,,7713,,3
mean,,,,,,,3.41512,,1996.723385,
std,,,,,,,3.084167,,11.300594,
min,,,,,,,1.0,,1980.0,
25%,,,,,,,1.0,,1988.0,
50%,,,,,,,2.0,,1996.0,
75%,,,,,,,4.0,,2008.0,


## Réduction de données

In [511]:
# Distinct gender labels; unique() also reports NaN when values are missing.
data_raw['gender'].unique()

array(['Men', 'Women', 'Mixed', nan], dtype=object)

In [512]:
# Distinct medal labels.
data_raw['medal'].unique()

array(['Gold', 'Silver', 'Bronze'], dtype=object)

In [513]:
# Distinct years present in the data.
data_raw['year'].unique()

array([1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016],
      dtype=int64)

In [514]:
# Distinct sports (the "type" column).
data_raw['type'].unique()

array(['Archery', 'Athletics', 'Basketball', 'Fencing', 'Powerlifting',
       'Rugby', 'Swimming', 'Table Tennis', 'Wheelchair Tennis',
       'Triathlon', 'Volleyball'], dtype=object)

In [515]:
# Distinct country codes. In the run shown here both NaN and '-' occur as
# placeholders for a missing code.
data_raw['abb'].unique()

array(['DEN', 'FRG', 'JPN', 'BEL', 'GBR', 'CAN', 'NOR', 'FRA', 'MEX',
       'NED', 'FIN', 'SWE', 'USA', 'AUS', 'KOR', 'AUT', 'POL', 'NZL', nan,
       'ESP', 'SUI', 'ITA', 'GER', 'EUN', 'CZE', 'UKR', 'SVK', 'CHN',
       'THA', 'TPE', 'MGL', 'TUR', 'MAS', 'RUS', 'IRI', 'GRE', 'ISR',
       'LUX', 'HKG', 'KUW', 'EGY', 'YUG', 'SUD', 'JAM', 'IRL', 'KEN',
       'BAH', 'ARG', 'ZIM', 'POR', 'BRA', 'BRN', 'ISL', 'BIR', 'HUN',
       'GDR', 'IND', 'JOR', 'TTO', 'URS', 'TUN', 'INA', 'BUL', 'PUR',
       'NGR', 'CUB', 'RSA', 'PAN', 'TCH', 'SLO', 'IRQ', 'VEN', 'LTU',
       'EST', 'IPP', 'CRO', 'ALG', 'URU', 'DOM', 'CIV', 'MDA', 'BLR',
       'UAE', 'LAT', 'PLE', 'ANG', 'RWA', 'MAR', 'AZE', 'SCG', 'BOT',
       'PNG', 'CYP', 'COL', 'KSA', 'PAK', 'NAM', 'SRB', 'SRI', 'ETH',
       'CHI', 'FIJ', 'CPV', 'UGA', 'UZB', 'QAT', 'VIE', 'MOZ', '-', 'LBA',
       'PHI', 'LAO', 'SYR', 'KAZ', 'FRO', 'PER', 'SGP', 'BIH'],
      dtype=object)

In [516]:
# Distinct country values. In the run shown here the list also contains
# athlete names, so the column is not a clean list of countries.
data_raw['country'].unique()

array([nan, 'FR Germany', 'Sweden', '-', 'Belgium', 'Great Britain',
       'Netherlands', 'France', 'Australia', 'Korea', 'Finland', 'Italy',
       'Japan', 'Germany', 'Spain', 'Poland', 'United States of America',
       'China', 'Czech Republic', 'Russia', 'Iran', 'Mexico', 'Denmark',
       'Canada', 'Switzerland', 'Portugal', 'Ireland', 'Austria',
       'Hong Kong', 'Thailand', 'Belarus', 'Cuba', 'Venezuela', 'Ukraine',
       'Tunisia', 'Brazil', 'Azerbaijan', 'South Africa', 'Uzbekistan',
       'Colombia', 'Israel', 'Hungary', 'New Zealand',
       'PAWLOWSKI Arkadiusz', 'OWCZAREK Miroslaw', 'FLORES Francisco',
       'MCISAAC Timothy', 'HEGLE Scott', 'KARLSSON Hakan', 'WINGER Dean',
       'FIELDING Petter', 'BERNER Sixten', 'COLLINS-SIMPSON G.',
       'GUDGEON Gary', 'BIELA Grzegorz', 'VAN VLIET Gerard',
       'PAETZOLD Kurt', 'GRONSKY Roman', 'KERS M.', 'VAN BUITEN Andre',
       'KOZAK Janusz', 'ROBESON Scott', 'OLSSON Anders', 'OLSSON Lars',
       'MACHOWCZYK Ryszard'

In [517]:
# Columns to remove from the working frame.
dropped_columns = ['guide', 'grp_id']

In [518]:
# Build the working frame from the untouched download instead of from
# data_raw itself: the cell can then be re-run safely, whereas dropping from
# data_raw a second time fails with a KeyError because the columns are gone.
data_raw = data_source.drop(columns=dropped_columns)

In [519]:
# List the columns currently held by the working frame.
data_raw.columns

Index(['gender', 'event', 'medal', 'athlete', 'abb', 'country', 'type',
       'year'],
      dtype='object')

In [520]:
# Summary statistics of the working frame, non-numeric columns included.
data_raw.describe(include='all')

Unnamed: 0,gender,event,medal,athlete,abb,country,type,year
count,19403,19547,19547,19112,19498,5119,19547,19547.0
unique,3,1670,3,6779,117,137,11,
top,Men,Tournament,Gold,ZORN Trischa,USA,United States of America,Athletics,
freq,11982,450,6611,44,1901,504,7713,
mean,,,,,,,,1996.723385
std,,,,,,,,11.300594
min,,,,,,,,1980.0
25%,,,,,,,,1988.0
50%,,,,,,,,1996.0
75%,,,,,,,,2008.0


In [521]:
# Preview the athlete column; as the last expression of the cell it is shown
# through the notebook's own display instead of print().
data_raw['athlete']

0                          LARSEN Finn
1                       BRENNE Manfred
2                           SATO Masao
3                             GEISS H.
4                             GRUN Guy
                     ...              
19542           DE ALMEIDA SILVA Nurya
19543         DE OLIVEIRA DIAS Edwarda
19544    MARIA LEIRIA DE CASTRO Camila
19545                   PEREIRA Pamela
19546         RODRIGUES BATISTA Laiana
Name: athlete, Length: 19547, dtype: object


In [522]:
# Copy the three text columns into plain Python lists in which a missing
# value (NaN) is written as the '-' placeholder. Entries that already are '-'
# stay '-', every other value is kept as it is.
data_athlete_name = data_raw['athlete'].fillna('-').tolist()
data_abb_name = data_raw['abb'].fillna('-').tolist()
data_country_name = data_raw['country'].fillna('-').tolist()

In [523]:
# Build the country-code -> country-name lookup, exposed as two parallel
# lists: abb_code[k] is a code and country_name[k] the name used for it.

# Hand-written names, applied last so that they override whatever name was
# collected from the data for the same code.
# NOTE(review): 'IPP' used to be assigned twice ('Serbia and Montenegro', then
# 'Serbia'); only the second assignment ever took effect, so that value is kept.
# NOTE(review): 'Côte Ivoire' looks like a typo for "Côte d'Ivoire"; it is left
# unchanged here because the string ends up in the exported data.
manual_country_names = {
    'MGL': 'Mongolia',
    'MAS': 'Malaysia',
    'SUD': 'Sudan',
    'GRE': 'Greece',
    'LUX': 'Luxembourg',
    'BRN': 'Bahrain',
    'ISL': 'Iceland',
    'BIR': 'Burma',
    'GDR': 'East Germany',
    'IND': 'India',
    'TTO': 'Trinidad and Tobago',
    'KEN': 'Kenya',
    'BAH': 'Bahamas',
    'ARG': 'Argentina',
    'ZIM': 'Zimbabwe',
    'INA': 'Indonesia',
    'BUL': 'Bulgaria',
    'PUR': 'Puerto Rico',
    'PAN': 'Panama',
    'SLO': 'Slovenia',
    'IRQ': 'Iraq',
    'LTU': 'Lithuania',
    'EST': 'Estonia',
    'SGP': 'Singapore',
    'IPP': 'Serbia',
    'PER': 'Peru',
    'FRO': 'Faroe Islands',
    'KAZ': 'Kazakhstan',
    'SYR': 'Syria',
    'LAO': 'Laos',
    'PHI': 'Philippines',
    'LBA': 'Libya',
    'MOZ': 'Mozambique',
    'QAT': 'Qatar',
    'URU': 'Uruguay',
    'DOM': 'Dominican Republic',
    'CIV': 'Côte Ivoire',
    'MDA': 'Moldova',
    'UAE': 'United Arab Emirates',
    'LAT': 'Latvia',
    'PLE': 'Palestine',
    'ANG': 'Angola',
    'RWA': 'Rwanda',
    'MAR': 'Morocco',
    'KSA': 'Saudi Arabia',
    'PAK': 'Pakistan',
    'NAM': 'Namibia',
    'CYP': 'Cyprus',
    'PNG': 'Papua New Guinea',
    'BOT': 'Botswana',
    'SCG': 'Serbia',
    'UGA': 'Uganda',
    'CPV': 'Cabo Verde',
    'FIJ': 'Fiji',
    'CHI': 'Chile',
    'ETH': 'Ethiopia',
    'SRI': 'Sri Lanka',
    'ALG': 'Algeria',
    'KUW': 'Kuwait',
    'NOR': 'Norway',
    'VIE': 'Vietnam',
}

# A dict keeps insertion order, which reproduces the ordering of the lists:
# the '-' placeholder first, then codes in order of first appearance.
abb_to_country = {'-': 'UNKNOWN'}

# Pass 1: a code takes the country of the first row where both are present.
# The '-' code is already a key, so setdefault never overwrites 'UNKNOWN'.
for abb, country in zip(data_abb_name, data_country_name):
    if country != '-':
        abb_to_country.setdefault(abb, country)

# Pass 2: codes that never appear next to a country get the '-' placeholder.
for abb in data_abb_name:
    abb_to_country.setdefault(abb, '-')

# Existing keys keep their position; a code that is absent from the data is
# simply appended instead of aborting the cell.
abb_to_country.update(manual_country_names)

abb_code = list(abb_to_country)
country_name = list(abb_to_country.values())
print(abb_code)
print(country_name)

['-', 'FRG', 'SWE', 'BEL', 'GBR', 'NED', 'FRA', 'AUS', 'KOR', 'FIN', 'ITA', 'JPN', 'GER', 'ESP', 'POL', 'USA', 'CHN', 'CZE', 'RUS', 'IRI', 'MEX', 'DEN', 'CAN', 'SUI', 'POR', 'IRL', 'AUT', 'HKG', 'THA', 'BLR', 'CUB', 'VEN', 'UKR', 'TUN', 'BRA', 'AZE', 'RSA', 'UZB', 'COL', 'ISR', 'HUN', 'NZL', 'TCH', 'KUW', 'YUG', 'NOR', 'URS', 'JAM', 'SVK', 'TPE', 'NGR', 'JOR', 'TUR', 'CRO', 'SRB', 'BIH', 'EGY', 'EUN', 'MGL', 'MAS', 'GRE', 'LUX', 'SUD', 'KEN', 'BAH', 'ARG', 'ZIM', 'BRN', 'ISL', 'BIR', 'GDR', 'IND', 'TTO', 'INA', 'BUL', 'PUR', 'PAN', 'SLO', 'IRQ', 'LTU', 'EST', 'IPP', 'ALG', 'URU', 'DOM', 'CIV', 'MDA', 'UAE', 'LAT', 'PLE', 'ANG', 'RWA', 'MAR', 'SCG', 'BOT', 'PNG', 'CYP', 'KSA', 'PAK', 'NAM', 'SRI', 'ETH', 'CHI', 'FIJ', 'CPV', 'UGA', 'QAT', 'VIE', 'MOZ', 'LBA', 'PHI', 'LAO', 'SYR', 'KAZ', 'FRO', 'PER', 'SGP']
['UNKNOWN', 'FR Germany', 'Sweden', 'Belgium', 'Great Britain', 'Netherlands', 'France', 'Australia', 'Korea', 'Finland', 'Italy', 'Japan', 'Germany', 'Spain', 'Poland', 'United Stat

In [524]:
def abb_to_name(abb_input):
    """Return the country name stored for a country code.

    The code is searched in the module-level list ``abb_code``; the name at
    the same position of ``country_name`` is returned.

    Parameters
    ----------
    abb_input : object
        Country code to look up.

    Returns
    -------
    The matching entry of ``country_name``, or ``'UNKNOWN'`` when the code is
    not in ``abb_code``.
    """
    try:
        position = abb_code.index(abb_input)
    except ValueError:
        # list.index raises ValueError for a code that is not listed.
        return 'UNKNOWN'
    return country_name[position]

In [553]:
# Row by row: give a missing athlete the value of the country column, replace
# the country by the name looked up from the code, and spell a missing code
# as 'UNKNOWN'.
# NOTE(review): the three lists are updated in place, so running this cell a
# second time works on values that were already converted.
for row, abb in enumerate(data_abb_name):
    athlete_is_missing = data_athlete_name[row] == '-'
    if athlete_is_missing:
        # Uses the country value as it is before the lookup below replaces it.
        data_athlete_name[row] = data_country_name[row]
    data_country_name[row] = abb_to_name(abb)
    if abb == '-':
        data_abb_name[row] = 'UNKNOWN'

In [554]:
# Attach the cleaned lists to the frame under temporary "new_*" column names.
cleaned_columns = {
    'new_country': data_country_name,
    'new_abb': data_abb_name,
    'new_athlete': data_athlete_name,
}
for column_label, column_values in cleaned_columns.items():
    data_raw[column_label] = column_values
data_raw

Unnamed: 0,gender,event,medal,type,year,country,abb,athlete,new_country,new_abb,new_athlete
0,Men,Double FITA Round Amputee,Gold,Archery,1980,Denmark,DEN,LARSEN Finn,Denmark,DEN,LARSEN Finn
1,Men,Double FITA Round Amputee,Silver,Archery,1980,FR Germany,FRG,BRENNE Manfred,FR Germany,FRG,BRENNE Manfred
2,Men,Double FITA Round Amputee,Bronze,Archery,1980,Japan,JPN,SATO Masao,Japan,JPN,SATO Masao
3,Men,Double FITA Round Paraplegic,Gold,Archery,1980,FR Germany,FRG,GEISS H.,FR Germany,FRG,GEISS H.
4,Men,Double FITA Round Paraplegic,Silver,Archery,1980,Belgium,BEL,GRUN Guy,Belgium,BEL,GRUN Guy
...,...,...,...,...,...,...,...,...,...,...,...
19542,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,DE ALMEIDA SILVA Nurya,Brazil,BRA,DE ALMEIDA SILVA Nurya
19543,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,DE OLIVEIRA DIAS Edwarda,Brazil,BRA,DE OLIVEIRA DIAS Edwarda
19544,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,MARIA LEIRIA DE CASTRO Camila,Brazil,BRA,MARIA LEIRIA DE CASTRO Camila
19545,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,PEREIRA Pamela,Brazil,BRA,PEREIRA Pamela


In [555]:
# Columns to remove: the originals that the "new_*" columns replace.
dropped_old_columns = ['athlete', 'country', 'abb']
data_raw = data_raw.drop(labels=dropped_old_columns, axis='columns')
data_raw

Unnamed: 0,gender,event,medal,type,year,new_country,new_abb,new_athlete
0,Men,Double FITA Round Amputee,Gold,Archery,1980,Denmark,DEN,LARSEN Finn
1,Men,Double FITA Round Amputee,Silver,Archery,1980,FR Germany,FRG,BRENNE Manfred
2,Men,Double FITA Round Amputee,Bronze,Archery,1980,Japan,JPN,SATO Masao
3,Men,Double FITA Round Paraplegic,Gold,Archery,1980,FR Germany,FRG,GEISS H.
4,Men,Double FITA Round Paraplegic,Silver,Archery,1980,Belgium,BEL,GRUN Guy
...,...,...,...,...,...,...,...,...
19542,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,DE ALMEIDA SILVA Nurya
19543,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,DE OLIVEIRA DIAS Edwarda
19544,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,MARIA LEIRIA DE CASTRO Camila
19545,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,PEREIRA Pamela


In [556]:
# Give the cleaned columns the names of the columns they replace. rename()
# returns a new frame; reassigning it avoids mutating in place a frame that
# earlier cells have already displayed.
data_raw = data_raw.rename(
    columns={'new_country': 'country', 'new_abb': 'abb', 'new_athlete': 'athlete'}
)

In [557]:
# Display the working frame in its current state.
data_raw

Unnamed: 0,gender,event,medal,type,year,country,abb,athlete
0,Men,Double FITA Round Amputee,Gold,Archery,1980,Denmark,DEN,LARSEN Finn
1,Men,Double FITA Round Amputee,Silver,Archery,1980,FR Germany,FRG,BRENNE Manfred
2,Men,Double FITA Round Amputee,Bronze,Archery,1980,Japan,JPN,SATO Masao
3,Men,Double FITA Round Paraplegic,Gold,Archery,1980,FR Germany,FRG,GEISS H.
4,Men,Double FITA Round Paraplegic,Silver,Archery,1980,Belgium,BEL,GRUN Guy
...,...,...,...,...,...,...,...,...
19542,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,DE ALMEIDA SILVA Nurya
19543,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,DE OLIVEIRA DIAS Edwarda
19544,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,MARIA LEIRIA DE CASTRO Camila
19545,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,PEREIRA Pamela


In [558]:
# notnull() turns every cell into a boolean; describe() then shows, column by
# column, whether a False (missing value) is still present.
data_raw.notnull().describe()

Unnamed: 0,gender,event,medal,type,year,country,abb,athlete
count,19547,19547,19547,19547,19547,19547,19547,19547
unique,1,1,1,1,1,1,1,1
top,True,True,True,True,True,True,True,True
freq,19547,19547,19547,19547,19547,19547,19547,19547


In [559]:
# Replace any remaining missing value of the text columns by 'UNKNOWN'.
# fillna is called on the frame itself: with inplace=True on a column
# selection (data_raw['gender'].fillna(...)) the fill may be applied to a
# temporary object and leave data_raw unchanged, notably when pandas
# Copy-on-Write is enabled.
data_raw = data_raw.fillna(
    value={
        'gender': 'UNKNOWN',
        'athlete': 'UNKNOWN',
        'abb': 'UNKNOWN',
        'country': 'UNKNOWN',
    }
)
data_raw

Unnamed: 0,gender,event,medal,type,year,country,abb,athlete
0,Men,Double FITA Round Amputee,Gold,Archery,1980,Denmark,DEN,LARSEN Finn
1,Men,Double FITA Round Amputee,Silver,Archery,1980,FR Germany,FRG,BRENNE Manfred
2,Men,Double FITA Round Amputee,Bronze,Archery,1980,Japan,JPN,SATO Masao
3,Men,Double FITA Round Paraplegic,Gold,Archery,1980,FR Germany,FRG,GEISS H.
4,Men,Double FITA Round Paraplegic,Silver,Archery,1980,Belgium,BEL,GRUN Guy
...,...,...,...,...,...,...,...,...
19542,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,DE ALMEIDA SILVA Nurya
19543,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,DE OLIVEIRA DIAS Edwarda
19544,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,MARIA LEIRIA DE CASTRO Camila
19545,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,PEREIRA Pamela


In [560]:
# One sub-frame per medal, selected with a boolean mask on the "medal" column.
gold_data, silver_data, bronze_data = [
    data_raw[data_raw['medal'] == medal_label]
    for medal_label in ('Gold', 'Silver', 'Bronze')
]

In [561]:
# Display the gold-medal rows.
gold_data

Unnamed: 0,gender,event,medal,type,year,country,abb,athlete
0,Men,Double FITA Round Amputee,Gold,Archery,1980,Denmark,DEN,LARSEN Finn
3,Men,Double FITA Round Paraplegic,Gold,Archery,1980,FR Germany,FRG,GEISS H.
6,Men,Double FITA Round Tetraplegic,Gold,Archery,1980,Canada,CAN,PARKER T.
9,Men,Double FITA Round Novice Paraplegic,Gold,Archery,1980,Mexico,MEX,CHAVEZ Alfredo
12,Men,Double FITA Round Novice Tetraplegic,Gold,Archery,1980,Finland,FIN,KARKAINEN K.
...,...,...,...,...,...,...,...,...
19518,Women,Sitting Volleyball,Gold,Volleyball,2016,United States of America,USA,EDWARDS Tia
19519,Women,Sitting Volleyball,Gold,Volleyball,2016,United States of America,USA,NIEVES Nicky
19520,Women,Sitting Volleyball,Gold,Volleyball,2016,United States of America,USA,SCHIFFLER Michelle
19521,Women,Sitting Volleyball,Gold,Volleyball,2016,United States of America,USA,SHIFFLETT Alexis


In [562]:
# Split the frame by medal, in order of first appearance of each label, then
# stack the pieces back into one frame whose rows are grouped by medal.
medal_labels = data_raw['medal'].unique()
partitions = [data_raw[data_raw['medal'] == label] for label in medal_labels]
data_new = pd.concat(partitions)
# Check that the re-assembled frame contains no missing value.
data_new.notnull().describe()

Unnamed: 0,gender,event,medal,type,year,country,abb,athlete
count,19547,19547,19547,19547,19547,19547,19547,19547
unique,1,1,1,1,1,1,1,1
top,True,True,True,True,True,True,True,True
freq,19547,19547,19547,19547,19547,19547,19547,19547


In [565]:
# Display the re-assembled frame.
data_new

Unnamed: 0,gender,event,medal,type,year,country,abb,athlete
0,Men,Double FITA Round Amputee,Gold,Archery,1980,Denmark,DEN,LARSEN Finn
3,Men,Double FITA Round Paraplegic,Gold,Archery,1980,FR Germany,FRG,GEISS H.
6,Men,Double FITA Round Tetraplegic,Gold,Archery,1980,Canada,CAN,PARKER T.
9,Men,Double FITA Round Novice Paraplegic,Gold,Archery,1980,Mexico,MEX,CHAVEZ Alfredo
12,Men,Double FITA Round Novice Tetraplegic,Gold,Archery,1980,Finland,FIN,KARKAINEN K.
...,...,...,...,...,...,...,...,...
19542,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,DE ALMEIDA SILVA Nurya
19543,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,DE OLIVEIRA DIAS Edwarda
19544,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,MARIA LEIRIA DE CASTRO Camila
19545,Women,Sitting Volleyball,Bronze,Volleyball,2016,Brazil,BRA,PEREIRA Pamela


In [566]:
# Write the cleaned table to data/athlete_cleaned.csv without the index.
# The folder is created first because to_csv does not create missing
# directories and would fail on a fresh checkout.
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)
data_new.to_csv(output_dir / 'athlete_cleaned.csv', index=False)

In [567]:
# Shorter export: the same table without the athlete and event columns.
data_short = data_new.drop(columns=['athlete', 'event'])
# The folder is created first because to_csv does not create missing
# directories and would fail on a fresh checkout.
short_output_dir = Path('data')
short_output_dir.mkdir(parents=True, exist_ok=True)
data_short.to_csv(short_output_dir / 'athlete_cleaned_short.csv', index=False)