In [1]:
import os, sys
sys.path.append(os.path.abspath(".."))

from functions import (
    load_config, read_csv_from_config,
    normalize_columns, clean_runners_pipeline,
    drop_empty_columns, fill_missing_with_unknown, standardize_text_column,
    print_shape
)

# --- Configuration and raw data ---------------------------------------
# Path of the project config file and the config key of the raw dataset,
# named up front so they are easy to spot and adjust.
CONFIG_PATH = "../config.yaml"
RAW_DATASET_KEY = "marathon_data"

# Both helpers come from the project's `functions` module: the first
# returns the settings, the second uses them to load the dataset stored
# under the given key (presumably via the path found in the config).
config = load_config(CONFIG_PATH)
df = read_csv_from_config(config, RAW_DATASET_KEY)

# Report the dimensions of the raw frame, then preview its first rows.
print_shape(df, "Raw data")
df.head()


Raw data shape: (884944, 5)


Unnamed: 0,YEAR,COUNTRY,GENDER,AGE,TIME
0,1974,,male,L1,02:44:53
1,1974,,male,L2,02:46:43
2,1974,,male,L2,02:48:08
3,1974,,male,L,02:48:40
4,1974,,male,L1,02:49:01


In [2]:
import pandas as pd

import yaml

# Parse the project configuration directly from the YAML file.
# safe_load only builds plain YAML data types from the document.
with open("../config.yaml", mode="r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Echo the parsed settings as the cell output.
config




{'input_data': {'marathon_data': '../data/raw/Berlin_Marathon_data_1974_2019.csv',
  'marathon_winners': '../data/raw/berlin_marathon_winners_1974_2024.csv'},
 'output_data': {'cleaned_data': '../data/clean/cleaned_marathon.csv',
  'cleaned_winners': '../data/clean/cleaned_marathon_winners.csv'}}

In [3]:
# Look up the raw results file in the config, then load it.
# low_memory=False turns off chunked parsing so each column's dtype is
# inferred from the whole file.
raw_csv_path = config["input_data"]["marathon_data"]
df = pd.read_csv(raw_csv_path, low_memory=False)

print(f"Shape: {df.shape}")
df.head()

Shape: (884944, 5)


Unnamed: 0,YEAR,COUNTRY,GENDER,AGE,TIME
0,1974,,male,L1,02:44:53
1,1974,,male,L2,02:46:43
2,1974,,male,L2,02:48:08
3,1974,,male,L,02:48:40
4,1974,,male,L1,02:49:01


In [4]:
# Step 3 — Normalize column names to snake_case
# Remember the incoming names so the rename can be shown side by side.
before = list(df.columns)

# Trim and lower-case first, then turn spaces into underscores, and
# finally discard every character that is not a-z, 0-9 or an underscore.
trimmed_lower = df.columns.str.strip().str.lower()
underscored = trimmed_lower.str.replace(' ', '_')
df.columns = underscored.str.replace(r'[^a-z0-9_]', '', regex=True)

print("Before:", before)
print("After: ", list(df.columns))
df.head()


Before: ['YEAR', 'COUNTRY', 'GENDER', 'AGE', 'TIME']
After:  ['year', 'country', 'gender', 'age', 'time']


Unnamed: 0,year,country,gender,age,time
0,1974,,male,L1,02:44:53
1,1974,,male,L2,02:46:43
2,1974,,male,L2,02:48:08
3,1974,,male,L,02:48:40
4,1974,,male,L1,02:49:01


In [5]:
# Step 4 — Check missing values
# Build the null mask once and derive both summaries from it.
missing_mask = df.isna()

# Absolute count of missing cells per column.
nulls = missing_mask.sum()
# Share of missing cells per column, as a percentage with two decimals.
nulls_percent = (missing_mask.mean() * 100).round(2)

print("Missing values per column:\n", nulls)
print("\nPercentage of missing values per column:\n", nulls_percent)


Missing values per column:
 year            0
country    854148
gender          0
age         12838
time            0
dtype: int64

Percentage of missing values per column:
 year        0.00
country    96.52
gender      0.00
age         1.45
time        0.00
dtype: float64


In [6]:
# Step 5 — Drop the 'country' and 'age' columns

print("Columns BEFORE:", list(df.columns))

# Remove each column in turn. errors="ignore" turns a missing column
# into a no-op, so re-running this cell does not raise.
for unwanted in ("country", "age"):
    df = df.drop(columns=[unwanted], errors="ignore")
    print(f"Column '{unwanted}' removed (if present) ")

print("Columns AFTER:", list(df.columns))
df.head()



Columns BEFORE: ['year', 'country', 'gender', 'age', 'time']
Column 'country' removed (if present) 
Column 'age' removed (if present) 
Columns AFTER: ['year', 'gender', 'time']


Unnamed: 0,year,gender,time
0,1974,male,02:44:53
1,1974,male,02:46:43
2,1974,male,02:48:08
3,1974,male,02:48:40
4,1974,male,02:49:01


In [7]:
# Sanity check: list the columns currently present in the frame.
print("Columnas actuales:", df.columns.tolist())

Columnas actuales: ['year', 'gender', 'time']


In [None]:
# Step 7 — Clean the gender column
#
# Column names were lower-cased in Step 3, so a check for the literal
# name "GENDER" never matches and the cleaning would be skipped without
# any message. Look the column up case-insensitively instead, so the
# step works both before and after the column-name normalization.
gender_col = next(
    (name for name in df.columns if str(name).lower() == "gender"),
    None,
)

if gender_col is None:
    # Make the skip visible rather than doing nothing without a trace.
    print("No gender column found; skipping gender cleaning.")
else:
    print("Unique values before:", df[gender_col].unique())

    # Accepted spellings and the canonical label each one collapses to.
    gender_labels = {"m": "male", "male": "male",
                     "f": "female", "female": "female"}

    df[gender_col] = (
        df[gender_col].astype(str).str.lower().str.strip()
        .map(gender_labels)   # values without an entry come back as NaN
        .fillna("unknown")    # ...and are labelled explicitly
    )

    print("Unique values after:", df[gender_col].unique())
    print(df[gender_col].value_counts())


In [None]:
# Step 8 — Parse TIME to duration and seconds

# 1) Pick the time column: the exact name "TIME" when it exists,
#    otherwise the first column whose name contains "time" in any case.
if "TIME" in df.columns:
    time_col = "TIME"
else:
    time_col = [name for name in df.columns if "time" in name.lower()][0]
print("Usando columna de tiempo:", time_col)

# 2) Convert to timedelta; values that cannot be parsed become NaT.
finish_time = pd.to_timedelta(df[time_col], errors="coerce")
df["finish_time"] = finish_time

# 3) The same duration expressed as a float number of seconds.
df["finish_seconds"] = finish_time.dt.total_seconds()

# 4) Quick verification: count unparsed values and preview the result.
print("NaT en finish_time:", df["finish_time"].isna().sum())
preview_cols = [time_col, "finish_time", "finish_seconds"]
df[preview_cols].head()


In [None]:
# Save cleaned & wrangled dataset
output_path = config["output_data"]["cleaned_data"]

# to_csv does not create missing folders, so make sure the target
# directory exists first (a fresh checkout may not have it yet).
# `or "."` covers a bare file name, whose dirname is the empty string.
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

# index=False keeps the row index out of the written file.
df.to_csv(output_path, index=False)

print("Dataset saved to:", output_path)
print("Final shape:", df.shape)


In [None]:
from pathlib import Path

# Relative location of the cleaned dataset, assembled from folder + file.
csv = Path("../data/clean") / "cleaned_marathon.csv"

# Show the path and, on the next line, whether the file is on disk.
print(csv, csv.exists(), sep="\n")

In [None]:
from pathlib import Path

# Turn the relative location of the cleaned dataset into an absolute one.
relative_csv = Path("../data/clean/cleaned_marathon.csv")
csv = relative_csv.resolve()

# as_posix() renders the path with forward slashes on every platform.
print(csv.as_posix())
