In [3]:
import numpy as np
import pandas as pd
#You'll make a copy of this df in the first line of most functions
global_sdf = pd.read_csv('fgswing.csv') 

### Project: You will cluster baseball players via the principal components of their swing profile. We will look at the following 6 measurements:

In [4]:
swing_cols = ['O-Swing%','Z-Swing%','O-Contact%','Z-Contact%','Soft%','Hard%']

These are defined as:
1. O-Swing%: The percentage of times the batter swings at balls *out of* the strike zone

2. Z-Swing%: The percentage of times the batter swings at balls *in* the strike zone

3. O-Contact%: The percentage of times the batter swings and makes contact with balls *out of* the strike zone.

4. Z-Contact%: The percentage of times the batter swings and makes contact with balls *in* the strike zone.

5. Soft%: The percentage of times the batter makes soft contact.  

6. Hard%: The percentage of times the batter makes hard contact.

Some DataFrame things you might find helpful (I did).  

1. If you find yourself wanting to rename columns in a DataFrame, try [this](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html)
You can do this project without ever renaming a column, I just found it convenient in a couple places

2. In the function get_comps you should definitely use DataFrame's sorting command to [sort values by column](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html). Maybe you can do this without using sort_values but it would be ugly.


3. You will do a *lot* of DataFrame indexing in this project. In addition to what you learned for the last project the following type of indexing will be used often. As always check the documentation for all you could want to know about [indexing](https://pandas.pydata.org/docs/user_guide/indexing.html)

In [5]:
sdf = global_sdf.copy()
sdf.head()

Unnamed: 0,Name,Team,Season,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Soft%,Hard%
0,Fernando Tatis Jr.,SDP,2021,0.332579,0.806135,0.478458,0.802131,0.148936,0.480243
1,Aaron Judge,NYY,2021,0.270287,0.674877,0.566591,0.842336,0.085642,0.468514
2,Miguel Sanó,MIN,2021,0.294337,0.684642,0.43038,0.80137,0.149306,0.454861
3,Shohei Ohtani,LAA,2021,0.30148,0.719876,0.527607,0.772532,0.108571,0.451429
4,Joey Votto,CIN,2021,0.246575,0.714597,0.570988,0.815549,0.08,0.449231


In [6]:
sdf = sdf.loc[sdf['Season'] == 2022,['Season','Team','Soft%']]
sdf.head()

Unnamed: 0,Season,Team,Soft%
132,2022,NYY,0.075
133,2022,HOU,0.091644
134,2022,TOR,0.100865
135,2022,LAD,0.112462
136,2022,MIL,0.175182


# Your worked-through handout from Wednesday, Nov. 6 should be with you

Do not worry about putting error messages in your functions if a user does not for example enter a year in the correct range.  Assume these functions are only being accessed after put through some error-message functions first.  In each function we make a copy of the global_sdf so you can freely do whatever you want to sdf without
messing up global_sdf.

In [7]:
def data_to_svd(year):
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy() 
    year : int in range 2021-2024 inclusive
    
    Returns
    -------
    Three 2D arrays : The U,S,Vh values in the SVD of the mean_zero version of the
                      swing_cols data
    """
    sdf = global_sdf.copy()
    A = sdf.loc[:, swing_cols]
    A = A - np.mean(A, axis=0)
    U, S, Vh = np.linalg.svd(A, full_matrices=False)
    return U, S, Vh

In [22]:
def make_pcdf():
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    
    Returns
    -------
    DataFrame with 8 columns: Name,Year, p1,...,p6. Below 'Name' is the player's 
    Name. Below 'Year' is the corresponding Year. Below each 'pi' is the weighted 
    ith principal component (i.e. the U values from 
    the mean_zero version of the swing_cols data) for the player in the given year
    """
    sdf = global_sdf.copy()
    names = list(sdf['Name'].unique())
    pc_columns = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6']
    result_df = pd.DataFrame(columns=['Name', 'Season'] + pc_columns)

    
    for name in names: 
        for year in range(2021, 2025):
            U, S, Vh = data_to_svd(sdf.loc[sdf['Name'] == name, 'Season'].iloc[0])
            pc_df = pd.DataFrame(U[:, :6], columns=pc_columns)
            pc_df['Name'] = name
            pc_df['Season'] = year
            result_df = pd.concat([result_df, pc_df], ignore_index=True)
    return result_df
        

In [23]:
make_pcdf()

  result_df = pd.concat([result_df, pc_df], ignore_index=True)


Unnamed: 0,Name,Season,p1,p2,p3,p4,p5,p6
0,Fernando Tatis Jr.,2021,-0.107335,0.006287,-0.054128,0.002964,0.020977,0.088561
1,Fernando Tatis Jr.,2021,-0.048419,0.071265,-0.072187,-0.046452,0.043092,-0.037985
2,Fernando Tatis Jr.,2021,-0.098704,0.072125,0.021912,-0.049039,0.045179,0.047362
3,Fernando Tatis Jr.,2021,-0.077599,0.044220,-0.041571,-0.033730,-0.035713,-0.024721
4,Fernando Tatis Jr.,2021,-0.053022,0.066080,-0.076978,0.010095,-0.002770,-0.052122
...,...,...,...,...,...,...,...,...
529195,Sal Frelick,2024,0.011363,-0.086696,0.058488,0.021138,-0.001877,0.002820
529196,Sal Frelick,2024,0.123350,-0.069894,-0.030302,-0.027598,-0.000495,-0.061997
529197,Sal Frelick,2024,0.066687,0.015352,0.090867,0.009871,-0.015286,-0.061458
529198,Sal Frelick,2024,0.095050,-0.055724,0.042678,0.005155,0.010504,0.004711


Let's see how variation percentages have changed over the last 4 years (2020 was a covid-shortened season so 4 years is a natural cutoff)

In [None]:
def make_vardf():
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    
    Returns
    -------
    DataFrame with 6 columns: Year, var1,...,var5.  Below 'Year' is the year and 
    below 'vari'is the percentage of total variance due to the first i principal 
    components corresponding to that year.
    
    """
    sdf = global_sdf.copy()

In [None]:
def get_comps(player,year):
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    player : string , name of a player
    year : int in range 2021-2024 inclusive
    
    Returns
    -------
    A python list of 3 player names, for example ['Bobby Witt','Steven Kwan','Spencer Steer']
    For the given player find the 3 players whose first three U coordinates are closest to
    the player's and return those names in a list.  There are 6 total U coordinates we will
    only consider the first three.
    """
    sdf = global_sdf.copy()
    #I found it easiest to make a 2 column df.  One column
    #with the player names and the second column had the distances between each player and 
    #the given player. Then I sorted this
    #df using sort_values to find the three closest

Don't do anything with the next function.  It just isolates some annoying DataFrame stuff needed for cluster_report

In [None]:
def comp_df(year):
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    year : int in range 2021-2024 inclusive
    
    Returns
    -------
    A DataFrame with index the 'Name' column of dataframe, 4 columns labelled 
    'Comp1','Comp2','Comp3'and empty entries.  
    """
    sdf = global_sdf.copy()
    sdf = sdf.loc[sdf['Season'] == year]
    names = list(sdf['Name'])
    m = len(names)
    A = np.empty((m,4),dtype=object)
    df = pd.DataFrame(A,columns=['Name','Comp1','Comp2','Comp3'])
    df['Name'] = names
    return df

Finally let's make a report with no numbers in it for our colleagues who are less numerically inclined. We will list all players in given year and then list their three closest comparable players or 'comps'

In [None]:
def cluster_report(year):
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    year : int in range 2021-2024 inclusive
    
    Returns
    -------
    Dataframe with 5 columns : Name,Year,Comp1,Comp2,Comp3.  'Name' and 'Year' are the name
    of a player and the year respectively.  comp1,comp2 and comp3 should be the three 
    closest players as calculated in get_comps.
    
    """
    sdf = global_sdf.copy()
    rep_df = comp_df(year)
    #Now fill in rep_df by looping through the names and computing the comp
    #This might take a minute to run, I chose clarity over efficiency...