# Data Cleaning

In [1]:
import pandas as pd 

In [2]:
# Load the season standings CSV and display it for a first look.
standings_csv = '../csvs/season/all_team_standings.csv'
df = pd.read_csv(standings_csv)
df

Unnamed: 0,Season,Team ID,Team Name,Owner(s),Wins,Losses,Points For,Points Against,Regular Season Rank,Final Rank
0,2012,1,Percy Whipped,Samuel Remler,7,6,1433.74,1363.62,3,1
1,2012,4,Forgetting BrandonMarshall,Jesse Hynes,7,6,1347.80,1281.94,4,2
2,2012,2,The Dawgs,"Julian Bombard, julian bombard",8,5,1263.98,1161.24,2,3
3,2012,7,Joe Buck Yourself,Hunter Rieger,8,5,1308.00,1189.54,1,4
4,2012,6,The Madd Prater,"kellen dreyer, Parker Denison",6,7,1234.48,1294.80,8,5
...,...,...,...,...,...,...,...,...,...,...
147,2024,3,Fergayson or th0t daughter,"daniel kvassov, Jake Price",7,7,1410.14,1395.26,7,8
148,2024,10,Rectum Wrecker$$$,"K S, Alex Rowland",4,10,1068.44,1342.72,12,9
149,2024,7,Kamara Harris,"Isaac Rothenberg, Keyan Rahim",5,9,1320.60,1338.20,10,10
150,2024,4,ham in a tube,Jesse Hynes,5,9,1276.60,1431.32,11,11


### Step 1: Normalize Team Owner Names

Map the correct owner name(s) to each team for every season.

In [3]:
import pandas as pd

def expand_owner_ranges(rows):
    """Expand inclusive season ranges into one row per (season, team).

    Parameters
    ----------
    rows : iterable of dict
        Each dict provides ``"Team ID"``, ``"Start"``, ``"End"`` and
        ``"Owner(s)"``. ``"Start"`` and ``"End"`` are integer seasons and both
        ends are included in the expansion.

    Returns
    -------
    pandas.DataFrame
        Columns ``Season``, ``Team ID`` and ``Owner(s)``, sorted by team and
        then season, with a fresh 0..n-1 index. An empty ``rows`` yields an
        empty frame that still has those three columns.
    """
    columns = ["Season", "Team ID", "Owner(s)"]
    records = [
        {"Season": season, "Team ID": row["Team ID"], "Owner(s)": row["Owner(s)"]}
        for row in rows
        for season in range(row["Start"], row["End"] + 1)
    ]
    # Passing the columns explicitly keeps the schema when there are no
    # records; sorting an empty, column-less frame would raise KeyError.
    expanded = pd.DataFrame(records, columns=columns)
    return expanded.sort_values(["Team ID", "Season"]).reset_index(drop=True)

# Ownership history, one tuple per stint:
# (team id, first season, last season, owner names).
_owner_spans = [
    # Team 1
    (1, 2012, 2024, "Samuel Remler"),
    # Team 2
    (2, 2012, 2024, "Julian Bombard"),
    # Team 3
    (3, 2012, 2012, "Daniel Kvassov"),
    (3, 2013, 2017, "Daniel Kvassov, Aidan Donahue"),
    (3, 2018, 2024, "Daniel Kvassov, Jake Price"),
    # Team 4
    (4, 2012, 2024, "Jesse Hynes"),
    # Team 5
    (5, 2012, 2012, "Aidan Donahue"),
    (5, 2013, 2024, "Yassine Hamdouni"),
    # Team 6
    (6, 2012, 2024, "Kellen Dreyer, Tal Litwin"),
    # Team 7
    (7, 2012, 2014, "Hunter Rieger"),
    (7, 2015, 2017, "Hunter Rieger, Jake Price"),
    (7, 2018, 2023, "Keyan Rahim"),
    (7, 2024, 2024, "Keyan Rahim, Isaac Rothenberg"),
    # Team 8
    (8, 2012, 2024, "Jacob Maler, Nathan Zicherman"),
    # Team 9
    (9, 2012, 2024, "Lorenzo Siemann"),
    # Team 10
    (10, 2012, 2024, "Alex Rowland"),
    # Team 11
    (11, 2012, 2024, "Reuben Goldberg"),
    # Team 12
    (12, 2014, 2017, "Liam Pauley"),
    (12, 2018, 2024, "Jasper Hebert, Joe Kahn"),
]

# Re-shape the tuples into the keyed records used downstream.
owner_ranges = [
    {"Team ID": team_id, "Start": first_season, "End": last_season, "Owner(s)": owners}
    for team_id, first_season, last_season, owners in _owner_spans
]

# Build the per-season owner lookup, then peek at its first and last rows.
owners_map = expand_owner_ranges(owner_ranges)
first_rows = owners_map.head()
last_rows = owners_map.tail()
first_rows, last_rows


(   Season  Team ID       Owner(s)
 0    2012        1  Samuel Remler
 1    2013        1  Samuel Remler
 2    2014        1  Samuel Remler
 3    2015        1  Samuel Remler
 4    2016        1  Samuel Remler,
      Season  Team ID                 Owner(s)
 149    2020       12  Jasper Hebert, Joe Kahn
 150    2021       12  Jasper Hebert, Joe Kahn
 151    2022       12  Jasper Hebert, Joe Kahn
 152    2023       12  Jasper Hebert, Joe Kahn
 153    2024       12  Jasper Hebert, Joe Kahn)

Apply the mapped owner names to the standings and save the cleaned file.

In [4]:
# Attach the mapped owner names to every standings row. The mapped column
# arrives as "Owner(s)_mapped" because both frames carry an "Owner(s)" column.
# validate="many_to_one" makes pandas raise MergeError if owners_map ever holds
# more than one row for a (Season, Team ID) pair, instead of silently
# duplicating standings rows in the saved file.
standings_clean = df.merge(
    owners_map,
    on=["Season", "Team ID"],
    how="left",
    suffixes=("", "_mapped"),
    validate="many_to_one",
)

# Prefer the mapped name; keep the original value where no mapping exists.
standings_clean["Owner(s)_Clean"] = standings_clean["Owner(s)_mapped"].fillna(standings_clean["Owner(s)"])

# Keep only the output columns, in this order.
cols_order = [
    "Season","Team ID","Team Name","Owner(s)_Clean","Wins","Losses",
    "Points For","Points Against","Regular Season Rank","Final Rank"
]
standings_clean = standings_clean.reindex(columns=cols_order)

standings_clean.to_csv("../csvs/season/all_team_standings_cleaned.csv", index=False)
print("✅ Saved: ../csvs/season/all_team_standings_cleaned.csv")


✅ Saved: ../csvs/season/all_team_standings_cleaned.csv


In [7]:
# Reload the saved file and spot-check a random sample of rows. A fixed
# random_state makes the sample identical on every Restart & Run All.
cleaned = pd.read_csv('../csvs/season/all_team_standings_cleaned.csv')
cleaned.sample(25, random_state=42)

Unnamed: 0,Season,Team ID,Team Name,Owner(s)_Clean,Wins,Losses,Points For,Points Against,Regular Season Rank,Final Rank
94,2020,5,The Chark Knight Rises,Yassine Hamdouni,8,5,1258.96,1201.0,3,3
13,2013,5,La Migra,Yassine Hamdouni,8,5,1259.3,1191.48,4,4
147,2024,3,Fergayson or th0t daughter,"Daniel Kvassov, Jake Price",7,7,1410.14,1395.26,7,8
9,2012,3,somewhere over the Dwayne Bowe,Daniel Kvassov,4,9,1099.3,1339.86,10,10
39,2015,3,Fuck Salt,"Daniel Kvassov, Aidan Donahue",5,8,1168.42,1217.1,9,8
51,2016,7,Last Place,"Hunter Rieger, Jake Price",6,7,1240.72,1226.24,8,8
99,2020,8,Thielen under The weather,"Jacob Maler, Nathan Zicherman",6,7,1224.54,1248.1,8,8
113,2021,1,Mr. Commish Sir,Samuel Remler,4,10,1272.98,1388.34,10,10
128,2023,12,Its literally too easy,"Jasper Hebert, Joe Kahn",9,5,1375.14,1240.6,3,1
88,2019,12,Its literally too easy,"Jasper Hebert, Joe Kahn",5,8,1175.52,1197.52,9,9
