In [109]:
import numpy as np
import pandas as pd
#You'll make a copy of this df in the first line of most functions
global_sdf = pd.read_csv('fgswing.csv') 

### Project: You will cluster baseball players via the principal components of their swing profile. We will look at the following 6 measurements:

In [70]:
swing_cols = ['O-Swing%','Z-Swing%','O-Contact%','Z-Contact%','Soft%','Hard%']

These are defined as:
1. O-Swing%: The percentage of times the batter swings at balls *out of* the strike zone

2. Z-Swing%: The percentage of times the batter swings at balls *in* the strike zone

3. O-Contact%: The percentage of times the batter swings and makes contact with balls *out of* the strike zone.

4. Z-Contact%: The percentage of times the batter swings and makes contact with balls *in* the strike zone.

5. Soft%: The percentage of times the batter makes soft contact.  

6. Hard%: The percentage of times the batter makes hard contact.

Some DataFrame things you might find helpful (I did).  

1. If you find yourself wanting to rename columns in a DataFrame, try [this](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html)
You can do this project without ever renaming a column, I just found it convenient in a couple places

2. In the function get_comps you should definitely use DataFrame's sorting command to [sort values by column](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html). Maybe you can do this without using sort_values but it would be ugly.


3. You will do a *lot* of DataFrame indexing in this project. In addition to what you learned for the last project the following type of indexing will be used often. As always check the documentation for all you could want to know about [indexing](https://pandas.pydata.org/docs/user_guide/indexing.html)

In [71]:
sdf = global_sdf.copy()
sdf.head()

Unnamed: 0,Name,Team,Season,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Soft%,Hard%
0,Fernando Tatis Jr.,SDP,2021,0.332579,0.806135,0.478458,0.802131,0.148936,0.480243
1,Aaron Judge,NYY,2021,0.270287,0.674877,0.566591,0.842336,0.085642,0.468514
2,Miguel Sanó,MIN,2021,0.294337,0.684642,0.43038,0.80137,0.149306,0.454861
3,Shohei Ohtani,LAA,2021,0.30148,0.719876,0.527607,0.772532,0.108571,0.451429
4,Joey Votto,CIN,2021,0.246575,0.714597,0.570988,0.815549,0.08,0.449231


In [72]:
sdf = sdf.loc[sdf['Season'] == 2022,['Season','Team','Soft%']]
sdf.head()

Unnamed: 0,Season,Team,Soft%
132,2022,NYY,0.075
133,2022,HOU,0.091644
134,2022,TOR,0.100865
135,2022,LAD,0.112462
136,2022,MIL,0.175182


# Your worked-through handout from Wednesday, Nov. 6 should be with you

Do not worry about putting error messages in your functions if a user does not for example enter a year in the correct range.  Assume these functions are only being accessed after put through some error-message functions first.  In each function we make a copy of the global_sdf so you can freely do whatever you want to sdf without
messing up global_sdf.

In [None]:
def data_to_svd(year):
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy() 
    year : int in range 2021-2024 inclusive
    
    Returns
    -------
    Three 2D arrays : The U,S,Vh values in the SVD of the mean_zero version of the
                      swing_cols data
    """
    # Creates local copy of global DataFrame
    sdf = global_sdf.copy()
    
    # *Inner sdf* creates Boolean mask where value is true or false for each row 
    #   (depending on if the season matches param year)
    # *Outer sdf* applies the mask to filter the dataframe
    year_data = sdf[sdf['Season'] == year]
    
    # Contains only 6 columns specified above under the given year
    A = year_data[swing_cols]
    
    # Normalizes mean of each column to center at 0
    A = A - np.mean(A, axis=0)
    
    U, S, Vh = np.linalg.svd(A, full_matrices=False)
    return U, S, Vh

In [None]:
def make_pcdf():
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    
    Returns
    -------
    DataFrame with 8 columns: Name,Year, p1,...,p6. Below 'Name' is the player's 
    Name. Below 'Year' is the corresponding Year. Below each 'pi' is the weighted 
    ith principal component (i.e. the U values from 
    the mean_zero version of the swing_cols data) for the player in the given year
    """
    
    # Creates local copy of global variable
    sdf = global_sdf.copy()
    
    # Creates 2D Array consisting of all unique player names in 'sdf'
    names = list(sdf['Name'].unique())
    
    # The columns that will be in the resulting dataframe
    pc_columns = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6']
    
    # Adds columns 'Name' and 'Season' to 'pc_columns' that will be the final 
    #   list of columns for the matrix
    result_df = pd.DataFrame(columns=['Name', 'Season'] + pc_columns)

    # Outer loop runs through each player
    for name in names: 
        
        # Inner loop iterates through each played season for the respective player
        for year in range(2021, 2025):
            
            # Computes SVD for player for a given year:
            U, S, Vh = data_to_svd(sdf.loc[sdf['Name'] == name, 'Season'].iloc[0])
            # - sdf['Name'] == name creates a boolean mask for rows where the name matches
            # - sdf.loc[sdf['Name'] == name, 'Season'] gets the 'Season' column value for that player
            # - .iloc[0] gets the first season found for that player.
            
            
            # pc_df is a DataFrame consisting of the first 6 columns of U labeled p1 - p6.
            # - Ecah row represents a player's principal components.
            # - These values represent transformed versions of the original swing metrics that capture 
            #   the main patterns in the data.
            pc_df = pd.DataFrame(U[:, :6], columns=pc_columns)
            
            # Fills the rest of the columns with the player's name and the year, respectively.
            pc_df['Name'] = name
            pc_df['Season'] = year
            
            # Appends the two DataFrames together vertically, with result_df containing 
            #   all previous players/seasons and pc_df containing the new DataFrame with the current
            #   principal components.
            # - ignore_index=True tells pandas to create new sequential index numbers instead of keeping
            #   the original indices.
            result_df = pd.concat([result_df, pc_df], ignore_index=True)
            
            
            '''
            If result_df initially looks like:
                      Name    Season       p1        p2        p3        p4        p5        p6
                0     Judge     2021   -0.198988 -0.030338 -0.095613  0.025153 -0.083895  0.156286

            And pc_df contains:
                      Name    Season       p1        p2        p3        p4        p5        p6
                0     Ohtani    2021   -0.090424  0.107734 -0.173920  0.112534 -0.030673 -0.102064

            After concat, result_df becomes:
                      Name    Season       p1        p2        p3        p4        p5        p6
                0     Judge     2021   -0.198988 -0.030338 -0.095613  0.025153 -0.083895  0.156286
                1     Ohtani    2021   -0.090424  0.107734 -0.173920  0.112534 -0.030673 -0.102064
            '''
            
    return result_df
        

In [75]:
make_pcdf()

  result_df = pd.concat([result_df, pc_df], ignore_index=True)


Unnamed: 0,Name,Season,p1,p2,p3,p4,p5,p6
0,Fernando Tatis Jr.,2021,-0.198988,-0.030338,-0.095613,0.025153,-0.083895,0.156286
1,Fernando Tatis Jr.,2021,-0.090424,0.107734,-0.173920,0.112534,-0.030673,-0.102064
2,Fernando Tatis Jr.,2021,-0.187508,0.117021,0.025111,0.143597,-0.064799,0.055129
3,Fernando Tatis Jr.,2021,-0.146201,0.049554,-0.089149,0.051856,0.088210,-0.068006
4,Fernando Tatis Jr.,2021,-0.100207,0.098056,-0.176726,-0.013767,0.014774,-0.103709
...,...,...,...,...,...,...,...,...
132559,Sal Frelick,2024,-0.025128,0.174495,0.122207,-0.005754,0.029738,-0.023224
132560,Sal Frelick,2024,-0.246239,0.142924,-0.049490,-0.041181,-0.050690,0.141066
132561,Sal Frelick,2024,-0.132060,-0.033879,0.189763,-0.032186,0.000869,0.133820
132562,Sal Frelick,2024,-0.189022,0.112016,0.098147,-0.001203,-0.032375,-0.011168


Let's see how variation percentages have changed over the last 4 years (2020 was a covid-shortened season so 4 years is a natural cutoff)

In [None]:
def make_vardf():
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    
    Returns
    -------
    DataFrame with 6 columns: Year, var1,...,var5.  Below 'Year' is the year and 
    below 'vari'is the percentage of total variance due to the first i principal 
    components corresponding to that year.
    
    """
    sdf = global_sdf.copy()
    years = range(2021, 2025)
    var_data = []
    
    for year in years:     
        # Compute the S matrix for the given year passed as a parameter.
        # - These are the diagonal singular values.
        _, S, _ = data_to_svd(year)
        
        # These serve to calculate variance, through the squares singular value (S^2)
        #   and the sum of all squared values (total_var)
        ssv = S**2
        total_var = np.sum(ssv)
        
        # Gets the running total of variances
        #   e.g. if ssv was [100,50,30,10,5], csum would be [100,150,180,...]
        csum = np.cumsum(ssv[:5])
        total_sum = np.sum(ssv)

        # Converts the sums to percentages, as given by the formula in the handout.
        percentages = (csum / total_var) * 100
        
        row_data = {
            'Year': year,
            'var1': percentages[0],
            'var2': percentages[1],
            'var3': percentages[2],
            'var4': percentages[3],
            'var5': percentages[4]
        }
        var_data.append(row_data)
        
        '''
            The resulting dataframe is of the form:
            Year     var1    var2    var3    var4    var5
            0  2021  54.086  75.848  85.937  93.052  97.213
            1  2022  51.838  74.969  85.784  92.285  97.028
            ...
        '''
        
    return pd.DataFrame(var_data)


In [77]:
def make_vardf2():
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    
    Returns
    -------
    DataFrame with 6 columns: Year, var1,...,var5.  Below 'Year' is the year and 
    below 'vari'is the percentage of total variance due to the first i principal 
    components corresponding to that year.
    """
    sdf = global_sdf.copy()
    years = range(2021, 2025)
    var_data = []
    
    for year in years:
        year_data = sdf[sdf['Season'] == year]
        
        A = year_data[swing_cols]
        
        A = A - np.mean(A, axis=0)
        
        _, S, _ = np.linalg.svd(A, full_matrices=False)
        
        singular_values_squared = S**2
        
        cumulative_sums = np.cumsum(singular_values_squared[:5])
        
        total_sum = np.sum(singular_values_squared)
        variance_percentages = (cumulative_sums / total_sum) * 100
        
        row_data = {
            'Year': year,
            'var1': variance_percentages[0],
            'var2': variance_percentages[1],
            'var3': variance_percentages[2],
            'var4': variance_percentages[3],
            'var5': variance_percentages[4]
        }
        var_data.append(row_data)
        
    return pd.DataFrame(var_data)

In [78]:
make_vardf()

Unnamed: 0,Year,var1,var2,var3,var4,var5
0,2021,54.086915,75.848217,85.937188,93.052788,97.213575
1,2022,51.838264,74.969649,85.784993,92.285136,97.028966
2,2023,49.271701,73.875661,86.083116,92.579105,97.073431
3,2024,54.91724,76.933818,87.744412,93.273324,97.499447


In [None]:
def get_comps(player,year):
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    player : string , name of a player
    year : int in range 2021-2024 inclusive
    
    Returns
    -------
    A python list of 3 player names, for example ['Bobby Witt','Steven Kwan','Spencer Steer']
    For the given player find the 3 players whose first three U coordinates are closest to
    the player's and return those names in a list.  There are 6 total U coordinates we will
    only consider the first three.
    """
    sdf = global_sdf.copy()
    #I found it easiest to make a 2 column df.  One column
    #with the player names and the second column had the distances between each player and 
    #the given player. Then I sorted this
    #df using sort_values to find the three closest
    
    # *year_data* gets data for specified year and *resets index*:
    # - It filters in the rows with Season matching year parameter
    #   reset_index() creates clean sequential indices, avoiding index-related errors
    # - How? IDFK.
    year_data = sdf[sdf['Season'] == year]
    year_data = year_data.reset_index(drop=True) # fixes an error in generating report
    
    # Next grabs the principal components for specified year:
    # Keeps only the first 3 columns of U, which are the most important
    # - so if U was 100x6, it turns into 100x3.
    U, _, _ = data_to_svd(year)
    U_3 = U[:, :3]
    
    # Next gets target player data:
    # - First finds the index of the specified player in year_data
    # - Then gets that player's 3 principal components from U_3
    plr_index = year_data[year_data['Name'] == player].index[0]
    plr_crd = U_3[plr_index]
    
    # Calculates distances by performing principal component analysis:
    # - U_3 - plr_crd subtracts player's coords from everyone's coords
    # - **2 squares the differences
    # - np.sum(..., axis=1) sums across columns (the squared differences)
    # - np.sqrt takes the root, i.e., calculating Euclidean distance btwn target player
    #   and every other player.
    distances = np.sqrt(np.sum((U_3 - plr_crd)**2, axis=1)) # unsure if correct

    # Creates DataFrame with results:
    # - One column for player names
    # - One column for their distances from the target player
    comp_df = pd.DataFrame({
        'Name': year_data['Name'],
        'distance': distances
    })
    
    # Compiles the final results
    # - Removes target player from comparisons
    # - Sorts by distance (closest players first)
    # - Returns list of names of 3 closest players
    comp_df = comp_df[comp_df['Name'] != player].sort_values('distance')
    return list(comp_df['Name'].head(3))



In [105]:
get_comps("Bryce Harper", 2021)

['Shohei Ohtani', 'Vladimir Guerrero Jr.', 'Ryan McMahon']

Don't do anything with the next function.  It just isolates some annoying DataFrame stuff needed for cluster_report

In [106]:
def comp_df(year):
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    year : int in range 2021-2024 inclusive
    
    Returns
    -------
    A DataFrame with index the 'Name' column of dataframe, 4 columns labelled 
    'Comp1','Comp2','Comp3'and empty entries.  
    """
    sdf = global_sdf.copy()
    sdf = sdf.loc[sdf['Season'] == year]
    names = list(sdf['Name'])
    m = len(names)
    A = np.empty((m,4),dtype=object)
    df = pd.DataFrame(A,columns=['Name','Comp1','Comp2','Comp3'])
    df['Name'] = names
    return df

Finally let's make a report with no numbers in it for our colleagues who are less numerically inclined. We will list all players in given year and then list their three closest comparable players or 'comps'

In [107]:
def cluster_report(year):
    """
    Parameters
    ----------
    global  : DataFrame sdf = global_sdf.copy()
    year : int in range 2021-2024 inclusive
    
    Returns
    -------
    Dataframe with 5 columns : Name,Year,Comp1,Comp2,Comp3.  'Name' and 'Year' are the name
    of a player and the year respectively.  comp1,comp2 and comp3 should be the three 
    closest players as calculated in get_comps.
    
    """
    sdf = global_sdf.copy()
    rep_df = comp_df(year)
    #Now fill in rep_df by looping through the names and computing the comp
    #This might take a minute to run, I chose clarity over efficiency...
    plrs = sdf[sdf['Season'] == year]['Name'].unique()
    rep_df['Season'] = year
    for plr in plrs:
        comps = get_comps(plr, year)
        index = rep_df[rep_df['Name'] == plr].index[0]
        rep_df.loc[index, ['Comp1', 'Comp2', 'Comp3']] = comps
    return rep_df[['Name', 'Season', 'Comp1', 'Comp2', 'Comp3']]

In [108]:
cluster_report(2022)

Unnamed: 0,Name,Season,Comp1,Comp2,Comp3
0,Aaron Judge,2022,Matt Chapman,Ryan McMahon,Yordan Alvarez
1,Yordan Alvarez,2022,Mookie Betts,Matt Chapman,Taylor Ward
2,Teoscar Hernández,2022,Dansby Swanson,Christian Walker,Ryan McMahon
3,Max Muncy,2022,Matt Chapman,Tommy Pham,Kyle Schwarber
4,Rowdy Tellez,2022,Josh Rojas,Paul Goldschmidt,Taylor Ward
...,...,...,...,...,...
125,Adam Frazier,2022,Nico Hoerner,Yuli Gurriel,Tony Kemp
126,Miguel Rojas,2022,Thairo Estrada,Starling Marte,Austin Hays
127,Isiah Kiner-Falefa,2022,J.P. Crawford,Tony Kemp,Miguel Rojas
128,Tony Kemp,2022,Adam Frazier,Yuli Gurriel,Nico Hoerner
