# **Clean and Prep ADA Race Results**

Run on Python 3.12 | No errors | No warnings

In [16]:
# Import packages

# For data manipulation
import pandas as pd

# for displaying and modifying the working directory
import os as os

In [17]:
# Strings that pandas should read as missing data.
# 'NA' is deliberately left out so that the Country Code NA (Namibia) is kept as a real value.
custom_na_values = [
    'N/A',
    'NaN',
    'null',
    '',  # empty field
]

In [18]:
# Tell python where to find the dataset and load it to dataframe df0.
# The folder defaults to the original location (absolute path, using \ and r prefix) but can be
# overridden by setting the RACE_RESULTS_DIR environment variable, so the notebook is not tied
# to a single machine.
data_dir = os.environ.get('RACE_RESULTS_DIR', r'D:\OneDrive\Documents\Python\Current\Race Results')

# Make that folder the working directory: the relative file names used in this notebook
# (the input CSV below and the CSV written by the final cell) resolve against it.
# os.chdir raises an OSError subclass (e.g. FileNotFoundError) if the folder is not usable.
os.chdir(data_dir)

# Load specific columns, ignore the default NA values and use the custom ones
# (so the string 'NA' in the Country column is not turned into a missing value)
df0 = pd.read_csv(
    "ADA_race_results.csv",
    usecols=['Name', 'Country', 'Time', 'Age Group', 'Enrollment'],
    keep_default_na=False,
    na_values=custom_na_values,
)

# Display the first 5 rows of the dataframe
df0.head()

Unnamed: 0,Name,Country,Time,Age Group,Enrollment
0,Youssouf Mahamat Allamine Tahir,TD,00:14:45,30 - 34,Run / Walk
1,Jhan Carlos,CO,00:14:45,20 - 24,Run / Walk
2,Nyasha Dzivai,ZA,00:14:45,35 - 39,Run / Walk
3,Chadwick Pridgen,US,00:14:45,40 - 44,Run / Walk
4,Neftali Pérez,VE,00:14:45,50 - 54,Run / Walk


In [19]:
# Print a summary of the dataframe: index range, column names, non-null counts, dtypes and memory usage
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5743 entries, 0 to 5742
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        5743 non-null   object
 1   Country     5743 non-null   object
 2   Time        5743 non-null   object
 3   Age Group   5743 non-null   object
 4   Enrollment  5743 non-null   object
dtypes: object(5)
memory usage: 224.5+ KB


In [20]:
# Count the missing values per column (isna is the same check as isnull)
df0.isna().sum()

Name          0
Country       0
Time          0
Age Group     0
Enrollment    0
dtype: int64

In [21]:
# Flag every row that is an exact repeat of an earlier row, then count the flags
duplicate_mask = df0.duplicated()
duplicate_mask.sum()

358

In [22]:
# Keep only the first occurrence of each row; later exact repeats are filtered out.
# The original index labels are kept (no re-numbering).
df0 = df0.loc[~df0.duplicated()]

In [23]:
# Verify that the duplicates have been removed.
# Fail loudly if any exact duplicate row is still present instead of relying on reading the row count by eye.
assert not df0.duplicated().any(), "duplicate rows are still present in df0"

# Show the summary again so the reduced row count is visible
df0.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5385 entries, 0 to 5741
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        5385 non-null   object
 1   Country     5385 non-null   object
 2   Time        5385 non-null   object
 3   Age Group   5385 non-null   object
 4   Enrollment  5385 non-null   object
dtypes: object(5)
memory usage: 252.4+ KB


In [24]:
# Remove the Name column; it is not among the columns written to the output file
df0 = df0.drop('Name', axis='columns')

In [25]:
# Append a constant Year column (same value on every row)
df0 = df0.assign(Year=2024)

In [26]:
# Append a constant Race column holding the race name (same value on every row)
df0 = df0.assign(Race='ADA')

In [27]:
# Check the result: Name should be gone and Year / Race should appear as the last columns
df0.head(n=5)

Unnamed: 0,Country,Time,Age Group,Enrollment,Year,Race
0,TD,00:14:45,30 - 34,Run / Walk,2024,ADA
1,CO,00:14:45,20 - 24,Run / Walk,2024,ADA
2,ZA,00:14:45,35 - 39,Run / Walk,2024,ADA
3,US,00:14:45,40 - 44,Run / Walk,2024,ADA
4,VE,00:14:45,50 - 54,Run / Walk,2024,ADA


In [28]:
# Relabel the Country column as Country Code
df0 = df0.rename({'Country': 'Country Code'}, axis='columns')

In [29]:
# Put the columns in their final left-to-right order
column_order = ['Race', 'Year', 'Country Code', 'Age Group', 'Time', 'Enrollment']
df0 = df0.loc[:, column_order]

In [30]:
# Check the result: the Country column should now read Country Code and the columns should be in the new order
df0.head(n=5)

Unnamed: 0,Race,Year,Country Code,Age Group,Time,Enrollment
0,ADA,2024,TD,30 - 34,00:14:45,Run / Walk
1,ADA,2024,CO,20 - 24,00:14:45,Run / Walk
2,ADA,2024,ZA,35 - 39,00:14:45,Run / Walk
3,ADA,2024,US,40 - 44,00:14:45,Run / Walk
4,ADA,2024,VE,50 - 54,00:14:45,Run / Walk


In [31]:
# Write the cleaned data to a new csv file in the current working directory.
# index=False leaves the row labels out of the file.
output_file = 'ADA Race Data for Visualization.csv'
df0.to_csv(output_file, index=False)