In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [6]:
file = 'Database Resources/model_dataset_raw.csv'
df = pd.read_csv(file)
df

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,population,gdp_per_capita
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0,4.641827e+07,25831.582305
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0,5.061704e+07,27221.524051
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1,3.585177e+07,43248.529909
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0,3.554150e+06,1848.061804
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0,4.595700e+06,37807.967276
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11226,265605954,Zurian Hechavarria,CUB,female,8/10/95,1.64,58.0,athletics,0,0,0,1.138956e+07,
11227,214461847,Zuzana Hejnova,CZE,female,12/19/86,1.73,63.0,athletics,0,0,0,1.055122e+07,17548.338213
11228,88361042,di Xiao,CHN,male,5/14/91,1.85,100.0,wrestling,0,0,0,1.371220e+09,8027.683810
11229,900065925,le Quoc Toan Tran,VIE,male,4/5/89,1.60,56.0,weightlifting,0,0,0,9.170380e+07,2111.138024


In [22]:
#drop null rows 
df.dropna(inplace=True)

In [23]:
#make a volume with dob values as datetime 
df["dob_converted"] = pd.to_datetime(df["dob"])
df["dob_converted"]

0       2069-10-17
1       1986-09-23
2       1992-05-27
3       1991-01-02
4       1990-11-26
           ...    
11225   1992-02-04
11227   1986-12-19
11228   1991-05-14
11229   1989-04-05
11230   1992-01-06
Name: dob_converted, Length: 10109, dtype: datetime64[ns]

In [24]:
#Use the dob values to calculate age using datetime. 
#There are some dob values that converted in the 2000s instead of the 1900s so they need to be converted 

from datetime import datetime, date
def age(born):
    born = born.date()
    today = datetime.strptime("05/08/2016", "%d/%m/%Y").date()
    
    #in came the dob gets converted 100 
    if born.year > 2016: 
        return today.year - (born.year - 100) - ((today.month, 
                                      today.day) < (born.month, 
                                                    born.day))
    else: 
        return today.year - born.year - ((today.month, 
                                      today.day) < (born.month, 
                                                    born.day))

In [25]:
#calcated age column 
df["age"] = df["dob_converted"].apply(age)

df.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,population,gdp_per_capita,dob_converted,age,total_medals
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0,46418269.0,25831.582305,2069-10-17,46,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0,50617045.0,27221.524051,1986-09-23,29,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1,35851774.0,43248.529909,1992-05-27,24,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0,3554150.0,1848.061804,1991-01-02,25,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0,4595700.0,37807.967276,1990-11-26,25,0


In [28]:
#create the total medals column
df["total_medals"] = df["gold"]+df["silver"]+df["bronze"]

df["total_medals"]

0        0
1        0
2        1
3        0
4        0
        ..
11225    0
11227    0
11228    0
11229    0
11230    0
Name: total_medals, Length: 10109, dtype: int64

In [32]:
#quick check of the total medals column by sorting athletes by most medals 
df.sort_values("total_medals",ascending=False)

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,population,gdp_per_capita,dob_converted,age,total_medals
7212,491565031,Michael Phelps,USA,male,6/30/85,1.94,90.0,aquatics,5,1,0,321418820.0,56115.718426,1985-06-30,31,6
5399,960103057,Katie Ledecky,USA,female,3/17/97,1.83,72.0,aquatics,4,1,0,321418820.0,56115.718426,1997-03-17,19,5
9627,770111957,Simone Biles,USA,female,3/14/97,1.45,47.0,gymnastics,4,0,1,321418820.0,56115.718426,1997-03-14,19,5
6356,37020908,Madeline Dirado,USA,female,4/5/93,1.76,64.0,aquatics,2,1,1,321418820.0,56115.718426,1993-04-05,23,4
9634,973414226,Simone Manuel,USA,female,8/2/96,1.78,72.0,aquatics,2,2,0,321418820.0,56115.718426,1996-08-02,20,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4051,551778014,Hugo Gonzalez de Oliveira,ESP,male,2/19/99,1.92,82.0,aquatics,0,0,0,46418269.0,25831.582305,1999-02-19,17,0
4052,823618365,Hugo Houle,CAN,male,9/27/90,1.83,69.0,cycling,0,0,0,35851774.0,43248.529909,1990-09-27,25,0
4053,931957390,Hugo Inglis,NZL,male,1/18/91,1.78,74.0,hockey,0,0,0,4595700.0,37807.967276,1991-01-18,25,0
4054,365733362,Hugo Parisi,BRA,male,8/1/84,1.72,69.0,aquatics,0,0,0,207847528.0,8538.589975,1984-08-01,32,0


In [33]:
#function to create a binary value if an athlete won any medal or not 
def placed(medals):
    if medals > 0: 
        placed = 1
    else: 
        placed = 0
    return placed

In [34]:
#create a new column with "placed" binary output
df["placed"]=df["total_medals"].apply(placed)

In [35]:
df.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,population,gdp_per_capita,dob_converted,age,total_medals,placed
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0,46418269.0,25831.582305,2069-10-17,46,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0,50617045.0,27221.524051,1986-09-23,29,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1,35851774.0,43248.529909,1992-05-27,24,1,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0,3554150.0,1848.061804,1991-01-02,25,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0,4595700.0,37807.967276,1990-11-26,25,0,0


In [41]:
#check the final values 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10109 entries, 0 to 11230
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              10109 non-null  int64         
 1   name            10109 non-null  object        
 2   nationality     10109 non-null  object        
 3   sex             10109 non-null  object        
 4   dob             10109 non-null  object        
 5   height          10109 non-null  float64       
 6   weight          10109 non-null  float64       
 7   sport           10109 non-null  object        
 8   gold            10109 non-null  int64         
 9   silver          10109 non-null  int64         
 10  bronze          10109 non-null  int64         
 11  population      10109 non-null  float64       
 12  gdp_per_capita  10109 non-null  float64       
 13  dob_converted   10109 non-null  datetime64[ns]
 14  age             10109 non-null  int64         
 15  to

In [43]:
df.to_csv("model_dataset_cleaned.csv",index=False)