In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet


In [2]:
olympic_df = pd.read_csv('Olympic_Swimming_Results_1912to2020.csv')
olympic_df

Unnamed: 0,Location,Year,Distance (in meters),Stroke,Relay?,Gender,Team,Athlete,Results,Rank
0,Tokyo,2020,100m,Backstroke,0,Men,ROC,Evgeny Rylov,51.98,1
1,Tokyo,2020,100m,Backstroke,0,Men,ROC,Kliment Kolesnikov,52,2
2,Tokyo,2020,100m,Backstroke,0,Men,USA,Ryan Murphy,52.19,3
3,Tokyo,2020,100m,Backstroke,0,Men,ITA,Thomas Ceccon,52.3,4
4,Tokyo,2020,100m,Backstroke,0,Men,CHN,Jiayu Xu,52.51,4
...,...,...,...,...,...,...,...,...,...,...
4354,Stockholm,1912,4x100,Freestyle,1,Women,SWE,"Greta Carlsson, Vera Thulin, Sonja Johnsson, G...",,5
4355,Stockholm,1912,4x200,Freestyle,1,Men,AUS,"Malcolm Champion, Cecil Healy, Harold H. Hardw...",00:10:11.200000,1
4356,Stockholm,1912,4x200,Freestyle,1,Men,USA,"Duke Paoa Kahanamoku, Harry J. Hebner, Perry M...",00:10:20.200000,2
4357,Stockholm,1912,4x200,Freestyle,1,Men,GBR,"Thomas Sidney Battersby, Henry Taylor, John Ga...",00:10:28.600000,3


In [3]:
olympic_df['Team'].value_counts().head(20)

Team
USA    896
AUS    451
GBR    289
JPN    249
GER    227
CAN    199
HUN    173
GDR    144
SWE    143
FRA    139
NED    137
URS    136
ITA    126
CHN    121
RUS     77
FRG     76
BRA     61
DEN     56
RSA     51
ROU     43
Name: count, dtype: int64

In [4]:
olympic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4359 entries, 0 to 4358
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Location              4359 non-null   object
 1   Year                  4359 non-null   int64 
 2   Distance (in meters)  4359 non-null   object
 3   Stroke                4359 non-null   object
 4   Relay?                4359 non-null   int64 
 5   Gender                4359 non-null   object
 6   Team                  4359 non-null   object
 7   Athlete               4345 non-null   object
 8   Results               4331 non-null   object
 9   Rank                  4359 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 340.7+ KB


In [5]:

olympic_df.describe()

Unnamed: 0,Year,Relay?,Rank
count,4359.0,4359.0,4359.0
mean,1982.936453,0.169764,3.164946
std,26.928344,0.375468,1.189715
min,1912.0,0.0,0.0
25%,1968.0,0.0,2.0
50%,1988.0,0.0,4.0
75%,2004.0,0.0,4.0
max,2020.0,1.0,5.0


In [6]:
olympic_df.columns

Index(['Location', 'Year', 'Distance (in meters)', 'Stroke', 'Relay?',
       'Gender', 'Team', 'Athlete', 'Results', 'Rank'],
      dtype='object')

In [7]:

olympic_df.shape

(4359, 10)

In [8]:
olympic_df.isnull().sum()

Location                 0
Year                     0
Distance (in meters)     0
Stroke                   0
Relay?                   0
Gender                   0
Team                     0
Athlete                 14
Results                 28
Rank                     0
dtype: int64

In [9]:
#Prepare Data for Analysis

In [10]:
# Step 1: Handle missing values
# Drop rows where 'Athlete' or 'Results' are missing, as these are likely important for analysis.
df_cleaned = olympic_df.dropna(subset=['Athlete', 'Results'])

# Step 2: Handle 'Distance (in meters)' conversion
# Create a new column to mark whether the event is a relay, based on the 'Distance (in meters)' column.
df_cleaned['Is Relay'] = df_cleaned['Distance (in meters)'].str.contains('x').astype(int)

# Convert non-relay distances to integers by removing 'm' and converting to int
df_cleaned.loc[df_cleaned['Is Relay'] == 0, 'Distance (in meters)'] = (
    df_cleaned.loc[df_cleaned['Is Relay'] == 0, 'Distance (in meters)']
    .str.replace('m', '')
    .astype(int)
)

# Step 3: Convert 'Results' to a numeric type
# Attempt to convert 'Results' to float, handling errors where the result might not be numeric
df_cleaned['Results'] = pd.to_numeric(df_cleaned['Results'], errors='coerce')

# Drop rows where 'Results' couldn't be converted to a numeric value
df_cleaned = df_cleaned.dropna(subset=['Results'])

# Step 4: Standardize 'Stroke' and 'Team' columns to ensure consistency
df_cleaned['Stroke'] = df_cleaned['Stroke'].str.strip().str.title()
df_cleaned['Team'] = df_cleaned['Team'].str.strip().str.upper()

# Display the cleaned data to ensure correctness
df_cleaned_info = df_cleaned.info()
df_cleaned_head = df_cleaned.head()

print(df_cleaned_info)
print(df_cleaned_head)

<class 'pandas.core.frame.DataFrame'>
Index: 750 entries, 0 to 4174
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Location              750 non-null    object 
 1   Year                  750 non-null    int64  
 2   Distance (in meters)  750 non-null    object 
 3   Stroke                750 non-null    object 
 4   Relay?                750 non-null    int64  
 5   Gender                750 non-null    object 
 6   Team                  750 non-null    object 
 7   Athlete               750 non-null    object 
 8   Results               750 non-null    float64
 9   Rank                  750 non-null    int64  
 10  Is Relay              750 non-null    int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 70.3+ KB
None
  Location  Year Distance (in meters)      Stroke  Relay? Gender Team  \
0    Tokyo  2020                  100  Backstroke       0    Men  ROC   
1    Tokyo  2020      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Is Relay'] = df_cleaned['Distance (in meters)'].str.contains('x').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Results'] = pd.to_numeric(df_cleaned['Results'], errors='coerce')
