In [1]:
import pandas as pd
import os

pd.set_option('display.max_columns', 500)

data_path = os.getcwd()[:-len('notebooks')] + 'Data/'
db_credit_card_fraud_detection_small = data_path + 'Credit_card_fraud_detection_small/'
db_beijing_path = data_path + 'Beijing_winter_Olympic_Games/'
transactional_data = data_path + 'Transactional_data/'

In [2]:
os.listdir(transactional_data)

['.DS_Store',
 'goat of tennis.csv',
 'Highest Holywood Grossing Movies.csv',
 'trains vs planes.csv',
 'Loan_Default.csv',
 'New York Citibike Trips.csv',
 'Business Sales Transaction.csv']

# Beijing

In [3]:
os.listdir(db_beijing_path)

['hockey_results.csv',
 'curling_results.csv',
 'athletes.csv',
 'technical_officials.csv',
 'medals_total.csv',
 'coaches.csv',
 'entries_discipline.csv',
 'hockey_players_stats.csv',
 'medals.csv',
 'events.csv']

# Ejercicio Práctica sábado:

Como comité olímpico, nos solicitan analizar la relación que existe entre el número de participantes enviados por cada país y el número de medallas obtenidos en los tres niveles. 

Nos interesa analizar este proceso a nivel país, y nos gustaría poder revisar la cantidad de atletas enviados por cada país (cantidad total, cantidad de mujeres y cantidad de hombres), total de medallas obtenidas, total de medallas de oro, total de medallas de plata y total de medallas de bronce (segmentados por hombres y mujeres). Además de esto, nos interesa contar la cantidad de coaches que envió cada país, de igual forma, divido por mujeres y hombres. Y por último agregar las columnas para los referies, la intención es analizar si la cantidad de personas enviadas por cada país está relacionada con la cantidad de medallas traidas por cada país. 

In [4]:
athletes = pd.read_csv(db_beijing_path + 'athletes.csv')
officials = pd.read_csv(db_beijing_path + 'technical_officials.csv')
coaches = pd.read_csv(db_beijing_path + 'coaches.csv')
medals = pd.read_csv(db_beijing_path + 'medals.csv')

In [5]:
# Creamos la lista de todos los países
countries = list(set(list(athletes['country'].unique()) 
                     + list(officials['country'].unique())
                     + list(coaches['country'].unique())))
countries = pd.Series(countries).sort_values()
countries

67                     Albania
77              American Samoa
72                     Andorra
61                   Argentina
81                     Armenia
                ...           
33                      Turkey
37                     Ukraine
38    United States of America
30                  Uzbekistan
15          Virgin Islands, US
Length: 92, dtype: object

In [6]:
# Revisamos que la variable tenga valores únicos
athletes['gender'].unique()

array(['Male', 'Female', 'F', 'M'], dtype=object)

In [7]:
# Normalizamos la variable
aux_athletes_dict = {
    'Male' : 'Male',
    'Female': 'Female',
    'F': 'Female',
    'M': 'Male'
}
athletes['gender'] = athletes['gender'].map(aux_athletes_dict)
athletes['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [8]:
# Revisamos que la variable tenga valores únicos
officials['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [9]:
# Revisamos que la variable tenga valores únicos
coaches['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [10]:
# Revisamos que la variable tenga valores únicos
medals['athlete_sex'].unique()

array(['X', 'W', 'M', 'O'], dtype=object)

Para esta situación en particular, tendremos que idear una alternativa, para poder obtener el sexo del atleta. Para esto utilizaremos diccionarios, los cuales crearemos a partir del dataframe de atletas.

In [11]:
medals_ = medals.copy()

In [12]:
dictionaries = dict()
vars_ = ['name', 'gender', 'country'] # Variables de interés
vars_f = ['name', 'gender'] # Llave y valor para el diccionario
f_var = 'country' # Variable de filtrado
for c in countries:
    dictionaries.update(dict(athletes[vars_].loc[athletes[f_var] == c][vars_f].values))

medals_['gender'] = medals_['athlete_name'].map(dictionaries)
medals_['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [13]:
# 1. Athletes
aux_athletes = (athletes[['country', 'gender', 'name']]
                .drop_duplicates()
                .pivot_table(index = 'country', columns = 'gender', aggfunc = 'count', fill_value = 0, 
                             values = 'name'))
aux_athletes.columns = [c.lower() + '_athletes' for c in aux_athletes.columns]
# 2. Officials
aux_officials = (officials[['country', 'gender', 'name']]
                 .drop_duplicates()
                 .pivot_table(index = 'country', columns = 'gender', aggfunc = 'count', fill_value = 0,
                            values = 'name'))
aux_officials.columns = [c.lower() + '_officials' for c in aux_officials.columns]
# 3. Coaches
aux_coaches = (coaches[['country', 'gender', 'name']]
               .drop_duplicates()
               .pivot_table(index = 'country', columns = 'gender', aggfunc = 'count', fill_value = 0,
                            values = 'name'))
aux_coaches.columns = [c.lower() + '_coaches' for c in aux_coaches.columns]
# 4. Medals
aux_medals = (medals_[['country', 'gender', 'athlete_name']]
                .pivot_table(index = 'country', columns = 'gender', aggfunc = 'count', fill_value = 0,
                            values = 'athlete_name'))
aux_medals.columns = [c.lower() + '_total_medals' for c in aux_medals.columns]
# 5. Female Medals
aux_female_medals = (medals_[['country', 'gender', 'medal_type']].loc[medals_['gender'] == 'Female']
                     .pivot_table(index = 'country', columns = 'medal_type', aggfunc = 'count',
                                  values = 'gender', fill_value = 0))
aux_female_medals.columns = [f'female_{c.lower()}_medals' for c in aux_female_medals.columns]
# 6. Male Medals
aux_male_medals = (medals_[['country', 'gender', 'medal_type']].loc[medals_['gender'] == 'Male']
                     .pivot_table(index = 'country', columns = 'medal_type', aggfunc = 'count',
                                  values = 'gender', fill_value = 0))
aux_male_medals.columns = [f'male_{c.lower()}_medals' for c in aux_male_medals.columns]

In [14]:
aux_total_athletes = pd.DataFrame(athletes[['country', 'name']].drop_duplicates().groupby('country').count()['name'])
aux_total_athletes.columns = ['athletes']

In [15]:
df = pd.DataFrame(index = countries)
df['country'] = df.index
df.reset_index(inplace = True, drop = True)

df = (df.merge(aux_athletes, how = 'left', left_on = 'country', right_on = 'country', suffixes=('', '_athletes'))
      .merge(aux_officials, how = 'left', left_on = 'country', right_on = 'country', suffixes=('', '_officials'))
      .merge(aux_coaches, how = 'left', left_on = 'country', right_on = 'country', suffixes=('', '_coaches'))
      .merge(aux_medals, how = 'left', left_on = 'country', right_on = 'country', suffixes=('', '_medals'))
      .merge(aux_female_medals, how = 'left', left_on = 'country', right_index = True, suffixes=('', '_medals'))
      .merge(aux_male_medals, how = 'left', left_on = 'country', right_index = True, suffixes=('', '_medals'))
      .merge(aux_total_athletes, how = 'left', left_on = 'country', right_index = True, suffixes = ('', '_tathletes'))
      .fillna(0))

In [16]:
columns = ['Cantidad total de atletas: athletes',
'Cantidad total de atletas hombres enviada: male_athletes',
'Cantidad total de atletas mujeres enviada: female_athletes',
'Cantidad total de coaches hombres enviada: male_coaches',
'Cantidad total de coaches mujeres enviada: female_coaches',
'Cantidad total de oficiales hombres enviada: male_officials',
'Cantidad total de oficiales mujeres enviada: female_officials',
'Cantidad total de medallas obtenidas por hombres: male_total_medals',
'Cantidad total de medallas obtenidas por mujeres: female_total_medals',
'Cantidad total de medallas de oro obtenidas por hombres: male_gold_medals',
'Cantidad total de medallas de oro obtenidas por mujeres: female_gold_medals',
'Cantidad total de medallas de plata obtenidas por hombres: male_silver_medals',
'Cantidad total de medallas de plata obtenidas por mujeres: female_silver_medals',
'Cantidad total de medallas de bronce obtenidas por hombres: male_bronze_medals',
'Cantidad total de medallas de bronce obtenidas por mujeres: female_bronze_medals']

columns = ['country'] + [c.split(':')[1].strip() for c in columns]

In [17]:
df[columns]

Unnamed: 0,country,athletes,male_athletes,female_athletes,male_coaches,female_coaches,male_officials,female_officials,male_total_medals,female_total_medals,male_gold_medals,female_gold_medals,male_silver_medals,female_silver_medals,male_bronze_medals,female_bronze_medals
0,Albania,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,American Samoa,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Andorra,5.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Argentina,6.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Armenia,6.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,Turkey,7.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88,Ukraine,46.0,24.0,22.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
89,United States of America,225.0,117.0,108.0,3.0,1.0,0.0,3.0,20.0,43.0,5.0,6.0,9.0,31.0,6.0,6.0
90,Uzbekistan,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


df[columns].to_clipboard()

df[columns].to_csv('Beijing_Oviedo_Quezada_Rolando.csv', index = False)