# We're going to show how much each Big Ten men's pole vaulter improves over the course of their career.

## Let's structure how we're going to do this:

### First, we need to define how we want to structure the project. 
### Let's take every athlete's freshman year and compare that to the best year out of their career.

### Next, let's load pandas and our data.

In [1]:
import pandas as pd

# Season -> DataFrame lookups, filled from the per-season CSV exports below
dataframes_lists, dataframes_champs = {}, {}

# Seasons to look for on disk
years = range(2010, 2025)

# Load "<prefix>_<year>.csv" for every season into the matching dictionary:
# all of the "lists" files first, then all of the "champs" files.
for prefix, target in (
    ("dataframes_lists", dataframes_lists),
    ("dataframes_champs", dataframes_champs),
):
    for year in years:
        # 2020 is skipped in both collections
        # (presumably there is no file for that season -- confirm).
        if year == 2020:
            continue
        filename = f"{prefix}_{year}.csv"
        target[year] = pd.read_csv(filename)


In [4]:
# Spot-check one season: the 2019 "lists" frame, a ruler line, then the
# 2019 "champs" frame, each on its own line.
divider = "==========================================================================="
print(dataframes_lists[2019], divider, dataframes_champs[2019], sep="\n")

              Athlete  Year            Team  Mark
0         Adam Coulon  JR-3         Indiana  5.52
1    Rashid Coulibaly  JR-3       Wisconsin  5.33
2      Brock Mammoser  SO-2         Indiana  5.31
3      Tyler Loontjer  SR-4        Nebraska  5.30
4   Trevor Stephenson  FR-1  Michigan State  5.22
5       Robert Oswald  JR-3      Ohio State  5.22
6         Kevin Cahoy  JR-3        Nebraska  5.21
7     Garrison Hughes  FR-1        Nebraska  5.16
8      Spencer Powell  JR-3        Nebraska  5.15
9       Jacob LaRocca  JR-3        Illinois  5.11
10        Mike Herauf  FR-1       Minnesota  5.05
11       Peyton Haack  FR-1            Iowa  5.00
12       John Uchytil  JR-3       Minnesota  4.91
13       Zach Podraza  SO-2        Nebraska  4.90
14         Jack Pompe  FR-1        Illinois  4.81
15      Alex Shinnerl  SO-2         Indiana  4.75
16          Jack Lint  SR-4        Michigan  4.71
17      Nick Guerrant  JR-3  Michigan State  4.70
18      Seth Kricheff  FR-1          Purdue  4.70


### We will only consider athletes that meet the following criteria: 
### 1) Had their freshman year (FR-1) in the dataframe. 
### 2) Have competed multiple years.

Note that athletes who have completed at least two years but have not yet finished their careers are still included.


In [5]:
# Collect the name of every athlete that is listed as a freshman (FR-1) in
# any season. This relies on the raw 'FR-1' label, so it has to run before
# the 'Year' column is converted to integers.
list_freshmen = []
for df in dataframes_lists.values():
    list_freshmen += df.loc[df['Year'] == 'FR-1', 'Athlete'].tolist()

# The same name can be collected more than once (FR-1 in two seasons, or a
# duplicated row). Count each name only once per season so that
# athlete_years really is "number of seasons the athlete appears in".
# dict.fromkeys keeps first-seen order; missing names are dropped because
# they can never match a real athlete.
unique_freshmen = [name for name in dict.fromkeys(list_freshmen) if pd.notna(name)]

# athlete name -> number of season DataFrames the athlete appears in
athlete_years = {}
for df in dataframes_lists.values():
    # Build the lookup once per season instead of scanning the column
    # again for every freshman.
    season_names = set(df['Athlete'])
    for athlete in unique_freshmen:
        if athlete in season_names:
            athlete_years[athlete] = athlete_years.get(athlete, 0) + 1

# Keep only the athletes who appear in more than one season
working_athletes = [athlete for athlete, count in athlete_years.items() if count > 1]
print(working_athletes)

['Kyle Campbell', 'Mitch Erickson', 'Derik Peterman', 'Sam Retzloff', 'Mitch Mammoser ', 'Christian Sanderfer', 'Steven Cahoy', 'Jesse Johnson', 'Tim Ehrhardt', 'Jed Fenske', 'Glen Harold', 'Noah Gary', 'Cole Gorski', 'Tyler Tappe', 'Tim Guthrie', 'Michael Hovater', 'Tyler Loontjer', 'Andy Jatis', 'Rashid Coulibaly', 'Jacob LaRocca', 'Spencer Powell', 'Kevin Cahoy', 'Adam Coulon', 'Robert Oswald', 'Cooper Jazo', 'Brock Mammoser', 'Alex Shinnerl', 'Trevor Stephenson', 'Garrison Hughes', 'Mike Herauf', 'Peyton Haack', 'Nathan Stone', 'Luke Knipe', 'Henry Sheldon', 'Tyler Sierks', 'Mason Mahacek', 'Nico Morales', 'Tyler Carrel', 'Jonathan Petersen', 'Jak Urlacher', 'Reo Ogundare', 'Daniel Drellishak', 'Grant Gogel', 'Riley Johnston', 'Cole Sheldon', 'Adam Blue', 'Daniel Affleck', 'Tristan McGarrah']


### Now let's quickly convert the 'Year' column in all our dataframes to something more workable.

In [6]:

def convert_year(value):
    """Turn a class-year label such as 'FR-1' into an integer from 1 to 4.

    The value is converted to a string and searched for the digits '1',
    '2' and '3', in that order; the first digit found is returned as an
    int. A value containing none of those digits yields 4.
    """
    label = str(value)
    for digit in ('1', '2', '3'):
        if digit in label:
            return int(digit)
    return 4

# Replace the 'Year' column of every season DataFrame with its integer
# form, first for dataframes_lists and then for dataframes_champs.
for collection in (dataframes_lists, dataframes_champs):
    for season_df in collection.values():
        season_df['Year'] = season_df['Year'].apply(convert_year)


### We'll create a dataframe for each athlete that meets our criteria, storing every year they've competed and their corresponding mark.

In [7]:
# athlete name -> DataFrame with one ('Year', 'Mark') row per appearance
athlete_dataframes = {}

for athlete in working_athletes:
    records = []

    # Walk the seasons in dictionary order and pick up every row that
    # carries this athlete's name; seasons without a match add nothing.
    for season_df in dataframes_lists.values():
        matches = season_df[season_df['Athlete'] == athlete]
        records.extend(
            {'Year': row['Year'], 'Mark': row['Mark']}
            for _, row in matches.iterrows()
        )

    # One small frame per athlete, built from the collected rows
    athlete_dataframes[athlete] = pd.DataFrame(records)

In [8]:
# Dump the whole athlete -> DataFrame mapping to eyeball what was collected
print(athlete_dataframes)

{'Kyle Campbell':    Year  Mark
0     1  5.00
1     2  5.06
2     3  4.95, 'Mitch Erickson':    Year  Mark
0     1  5.06
1     2  5.27
2     3  5.06, 'Derik Peterman':    Year  Mark
0     1  5.03
1     3  5.05
2     4  5.08, 'Sam Retzloff':    Year  Mark
0     1  4.90
1     2  5.01, 'Mitch Mammoser ':    Year  Mark
0     1  5.06
1     4  4.98
2     3  5.21
3     4  4.96, 'Christian Sanderfer':    Year  Mark
0     1  5.05
1     2  5.30
2     3  5.11
3     4  4.81, 'Steven Cahoy':    Year  Mark
0     1  5.40
1     2  5.40
2     3  5.22
3     4  5.31, 'Jesse Johnson':    Year  Mark
0     1  5.22
1     2  5.38, 'Tim Ehrhardt':    Year  Mark
0     1  5.17
1     2  5.31
2     3  5.31
3     4  5.41, 'Jed Fenske':    Year  Mark
0     1  4.72
1     2  4.95
2     3  5.11
3     4  4.91, 'Glen Harold':    Year  Mark
0     1  5.11
1     2  5.05
2     3  5.25
3     4  5.07, 'Noah Gary':    Year  Mark
0     1  5.01
1     2  5.21
2     3  5.35
3     4  5.23, 'Cole Gorski':    Year  Mark
0     1  5.01


### Looks a bit ugly in the given window, but if you look closely you see it's storing the data correctly.

### Now let's subtract each athlete's first-year mark from their best mark to find out how much they've improved.

### When we're done, we can analyze all the athletes to find the Big Ten men's pole vault average improvement.

In [11]:
# Step 1: Loop through each athlete in athlete_dataframes
# meter_improvement_list collects one (best mark - first mark) value per athlete.
meter_improvement_list = []
for athlete, df in athlete_dataframes.items():
    # Order the rows by class year. mergesort is a stable algorithm, so rows
    # sharing the same 'Year' keep the order they have in the athlete's
    # DataFrame; the default quicksort gives no such guarantee, which would
    # make the "first year" row ambiguous for a repeated class year.
    df_sorted = df.sort_values(by='Year', kind='mergesort')

    # Step 2 & 3: Identify the first year and its mark
    first_row = df_sorted.iloc[0]
    first_year = first_row['Year']
    first_year_mark = first_row['Mark']

    # Step 4: Identify the best year and its mark
    best_mark = df_sorted['Mark'].max()
    best_year_df = df_sorted[df_sorted['Mark'] == best_mark]
    best_year = best_year_df.iloc[0]['Year']  # In case of ties, select the earliest year

    # Improvement is best minus first, so it is 0 when the first row is
    # already the athlete's best mark.
    meter_improvement = best_mark - first_year_mark
    meter_improvement_list.append(meter_improvement)

    # Remember this one athlete's improvement for the later comparison;
    # jak_improvement is only bound if a matching name is present.
    if 'Jak Urlacher' in athlete:
        jak_improvement = meter_improvement

    # Step 5: Compare and print the results
    print(f"Athlete: {athlete}")
    print(f"First Year: {first_year}, Mark: {first_year_mark}")
    print(f"Best Year: {best_year}, Best Mark: {best_mark}")
    print(f"Meter Improvement: {meter_improvement:.2f}")
    print("---")


Athlete: Kyle Campbell
First Year: 1.0, Mark: 5.0
Best Year: 2.0, Best Mark: 5.06
Meter Improvement: 0.06
---
Athlete: Mitch Erickson
First Year: 1.0, Mark: 5.06
Best Year: 2.0, Best Mark: 5.27
Meter Improvement: 0.21
---
Athlete: Derik Peterman
First Year: 1.0, Mark: 5.03
Best Year: 4.0, Best Mark: 5.08
Meter Improvement: 0.05
---
Athlete: Sam Retzloff
First Year: 1.0, Mark: 4.9
Best Year: 2.0, Best Mark: 5.01
Meter Improvement: 0.11
---
Athlete: Mitch Mammoser 
First Year: 1.0, Mark: 5.06
Best Year: 3.0, Best Mark: 5.21
Meter Improvement: 0.15
---
Athlete: Christian Sanderfer
First Year: 1.0, Mark: 5.05
Best Year: 2.0, Best Mark: 5.3
Meter Improvement: 0.25
---
Athlete: Steven Cahoy
First Year: 1.0, Mark: 5.4
Best Year: 1.0, Best Mark: 5.4
Meter Improvement: 0.00
---
Athlete: Jesse Johnson
First Year: 1.0, Mark: 5.22
Best Year: 2.0, Best Mark: 5.38
Meter Improvement: 0.16
---
Athlete: Tim Ehrhardt
First Year: 1.0, Mark: 5.17
Best Year: 4.0, Best Mark: 5.41
Meter Improvement: 0.24
---

In [13]:
# Step 6: Average improvement in meters over all athletes, and how one
# athlete's improvement compares with it
athlete_count = len(meter_improvement_list)
average_meter_improvement = sum(meter_improvement_list) / athlete_count

print(f"Jak Urlacher's Improvement: {jak_improvement:.2f}")
print(f"Average Meter Improvement: {average_meter_improvement:.2f}")

# Ratio of the individual improvement to the group average
jak_vs_average = jak_improvement / average_meter_improvement
print(f"Jak Urlacher's Improvement is {jak_vs_average:.2f} times the average improvement.")

Jak Urlacher's Improvement: 0.46
Average Meter Improvement: 0.20
Jak Urlacher's Improvement is 2.26 times the average improvement.


### Beautiful. Now let's save each athlete's data to its own CSV file for next time.

In [14]:
import os

# Folder that receives one CSV per athlete; created if missing, reused otherwise
output_dir = "athlete_csvs"
os.makedirs(output_dir, exist_ok=True)

# File names come from the athlete names: every space becomes an underscore
# and every period is dropped. No other characters are changed.
filename_cleanup = str.maketrans({' ': '_', '.': None})

for athlete, df in athlete_dataframes.items():
    filename = athlete.translate(filename_cleanup) + ".csv"
    filepath = os.path.join(output_dir, filename)
    # Write the athlete's rows without the DataFrame index column
    df.to_csv(filepath, index=False)

print("All athlete DataFrames have been saved to CSV files.")

All athlete DataFrames have been saved to CSV files.
