In [1]:
import os, sys
sys.path.append(os.path.abspath(".."))

from src.functions import (
    load_config, read_csv_from_config,
    normalize_columns, clean_runners_pipeline,
    drop_empty_columns, fill_missing_with_unknown, standardize_text_column,
    print_shape
)

# Load the project configuration from config.yaml in the parent directory.
# NOTE(review): load_config is a project helper from src.functions; presumably
# it parses the YAML file into a dict-like object — confirm in src/functions.
config = load_config("../config.yaml")

# Read the raw marathon runners dataset via the project helper, using the
# "marathon_data" key (adjust the key name if config.yaml uses a different one).
# NOTE(review): presumably the helper resolves the CSV path from `config` — verify.
df = read_csv_from_config(config, "marathon_data")

# Report the frame's shape under the label "Raw data", then preview the first rows.
print_shape(df, "Raw data")
df.head()


Raw data shape: (884944, 5)


Unnamed: 0,YEAR,COUNTRY,GENDER,AGE,TIME
0,1974,,male,L1,02:44:53
1,1974,,male,L2,02:46:43
2,1974,,male,L2,02:48:08
3,1974,,male,L,02:48:40
4,1974,,male,L1,02:49:01


In [3]:
import pandas as pd
import yaml

# 1) Parse config.yaml, located one directory above the working directory.
with open("../config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# 2) Read the winners dataset from the path stored under
#    input_data -> marathon_winners in the configuration.
winners_path = config["input_data"]["marathon_winners"]
winners_df = pd.read_csv(winners_path, low_memory=False)

# Show the dimensions, then preview the first rows.
print("Shape:", winners_df.shape)
winners_df.head()


Shape: (50, 7)


Unnamed: 0,YEAR,MEN_WINNER,MEN_COUNTRY,MEN_TIME,WOMEN_WINNER,WOMEN_COUNTRY,WOMEN_TIME
0,1974,Günter Hallas,West Germany,2:44:53,Jutta von Haase,West Germany,3:22:01
1,1975,Ralf Bochröder,West Germany,2:47:08,Kristin Bochröder,West Germany,2:59:15
2,1976,Ingo Sensburg,West Germany,2:23:08,Jutta von Haase,West Germany,3:05:19
3,1977,Günter Mielke,West Germany,2:15:19,Christa Vahlensieck,West Germany,2:34:48
4,1978,Michael Spöttel,West Germany,2:20:03,Ursula Blaschke,West Germany,2:57:09


In [4]:
# Quick data-quality overview of the winners table.
column_names = winners_df.columns.tolist()      # column labels
null_counts = winners_df.isna().sum()           # missing values per column
duplicate_rows = winners_df.duplicated().sum()  # number of fully duplicated rows

print(column_names)
print(null_counts)
print("duplicates:", duplicate_rows)


['YEAR', 'MEN_WINNER', 'MEN_COUNTRY', 'MEN_TIME', 'WOMEN_WINNER', 'WOMEN_COUNTRY', 'WOMEN_TIME']
YEAR             0
MEN_WINNER       0
MEN_COUNTRY      0
MEN_TIME         0
WOMEN_WINNER     0
WOMEN_COUNTRY    0
WOMEN_TIME       0
dtype: int64
duplicates: 0


In [5]:
# Normalize the column labels step by step: trim surrounding whitespace,
# lower-case everything, and turn spaces into underscores.
trimmed_labels = winners_df.columns.str.strip()
lowercase_labels = trimmed_labels.str.lower()
winners_df.columns = lowercase_labels.str.replace(" ", "_")

print("Columns after standardization:", winners_df.columns.tolist())
winners_df.head()


Columns after standardization: ['year', 'men_winner', 'men_country', 'men_time', 'women_winner', 'women_country', 'women_time']


Unnamed: 0,year,men_winner,men_country,men_time,women_winner,women_country,women_time
0,1974,Günter Hallas,West Germany,2:44:53,Jutta von Haase,West Germany,3:22:01
1,1975,Ralf Bochröder,West Germany,2:47:08,Kristin Bochröder,West Germany,2:59:15
2,1976,Ingo Sensburg,West Germany,2:23:08,Jutta von Haase,West Germany,3:05:19
3,1977,Günter Mielke,West Germany,2:15:19,Christa Vahlensieck,West Germany,2:34:48
4,1978,Michael Spöttel,West Germany,2:20:03,Ursula Blaschke,West Germany,2:57:09


In [6]:
# Reshape the winners data from wide format (one row per year with separate
# men_* / women_* columns) to long format (one row per year and gender).


def _winners_for_gender(winners: pd.DataFrame, prefix: str, gender: str) -> pd.DataFrame:
    """Extract one gender's winner columns under gender-neutral names.

    Parameters
    ----------
    winners : pd.DataFrame
        Wide frame containing ``year`` plus ``<prefix>_winner``,
        ``<prefix>_country`` and ``<prefix>_time`` columns.
    prefix : str
        Column prefix identifying the gender block ("men" or "women").
    gender : str
        Label written to every row of the new ``gender`` column.

    Returns
    -------
    pd.DataFrame
        New frame with columns ``year``, ``winner``, ``country``, ``time``,
        ``gender`` (in that order); ``winners`` itself is not modified.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``winners``.
    """
    # Map e.g. "men_winner" -> "winner"; dict order fixes the column order.
    renames = {f"{prefix}_{field}": field for field in ("winner", "country", "time")}
    return (
        winners[["year", *renames]]
        .rename(columns=renames)
        .assign(gender=gender)
    )


men = _winners_for_gender(winners_df, "men", "male")
women = _winners_for_gender(winners_df, "women", "female")

# Stack both halves; ignore_index gives the result a fresh 0..n-1 index.
winners_clean = pd.concat([men, women], ignore_index=True)

print("Shape:", winners_clean.shape)
winners_clean.head(10)


Shape: (100, 5)


Unnamed: 0,year,winner,country,time,gender
0,1974,Günter Hallas,West Germany,2:44:53,male
1,1975,Ralf Bochröder,West Germany,2:47:08,male
2,1976,Ingo Sensburg,West Germany,2:23:08,male
3,1977,Günter Mielke,West Germany,2:15:19,male
4,1978,Michael Spöttel,West Germany,2:20:03,male
5,1979,Ingo Sensburg,West Germany,2:21:09,male
6,1980,Ingo Sensburg,West Germany,2:16:48,male
7,1981,Ian Ray,United Kingdom,2:15:42,male
8,1982,Domingo Tibaduiza,Colombia,2:14:47,male
9,1983,Karel Lismont,Belgium,2:13:37,male


In [7]:
# Save the cleaned winners dataset to the path configured under
# output_data -> cleaned_winners.
output_path = config["output_data"]["cleaned_winners"]

# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists first (skipped when the path has no directory part).
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False keeps the positional index out of the CSV file.
winners_clean.to_csv(output_path, index=False)

print("Winners dataset saved to:", output_path)


Winners dataset saved to: ../data/clean/cleaned_marathon_winners.csv
