# C04. Players

In [1]:
%run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
%run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
%run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"

### Goals:
- Compare my FP projections to actual FP scored
    - Overall
    - Lefty/Righty
    - By Park/Weather
    - By imputation status
    - By starting pitcher status
    - By year
    - By projection decile (are low scores too low, are they too high?)
- Compare my FP projections to other FP projections
- Compare my scoring-component projections to the actual scoring components (projected singles vs. actual singles, etc.)

### Dates

In [2]:
# Evaluation window; both bounds are inclusive in the game_df['date'] filter below.
# yesterdaysdate is not defined in this notebook — presumably it comes from the
# %run utility notebooks (TODO confirm).
start_date = "20240318"
# start_date = yesterdaysdate
end_date = yesterdaysdate
# end_date = "20240518"

### Games

In [3]:
# Load the game table (read_and_save_games and team_map are defined outside this notebook).
# NOTE(review): generate=True presumably rebuilds the table instead of reading a saved copy — confirm.
game_df = read_and_save_games(team_map, generate=True)
# Keep only games whose date falls inside [start_date, end_date] and renumber the rows.
game_df = game_df[(game_df['date'] >= start_date) & (game_df['date'] <= end_date)].reset_index(drop=True)

In [4]:
# One "Matchups <date>" folder name per distinct game date in the window.
date_list = list(game_df['date'].unique())
date_folders = [f"Matchups {date}" for date in date_list]

# One "Players <game_id>" folder name per distinct game in the window.
game_list = list(game_df['game_id'].unique())
player_folders = [f"Players {game}" for game in game_list]

### Create Player Stat Dataframe

Extract date, teams, and gamePk from folder names

In [5]:
def extract_info_from_folder(date_folder, matchup_folder):
    """Parse the date, both teams and the gamePk out of a pair of folder names.

    Args:
        date_folder: Name of the form "<label> <date>"; the second
            space-separated token is returned as the date.
        matchup_folder: Name of the form "<away>@<home> <gamePk>".

    Returns:
        Tuple ``(date, away_team, home_team, gamePk)``, all strings.

    Raises:
        IndexError: If either name has no second space-separated token.
        ValueError: If the first token of ``matchup_folder`` does not split
            into exactly two parts on "@".
    """
    # Second token of the date folder is the date itself
    date = date_folder.split(' ')[1]

    # First token holds "<away>@<home>", second token is the gamePk
    tokens = matchup_folder.split(' ')
    teams = tokens[0].split('@')
    away_team, home_team = teams

    return date, away_team, home_team, tokens[1]

Average player stats for a given position group

In [6]:
def game_averages(date_folder, matchup_folder, position='pitchers'):
    """Average every simulation CSV of one game into one row per player.

    Reads all files in the matchup's "2. Player Sims" folder whose name starts
    with ``position`` and ends with ".csv", stacks them, and averages the
    numeric columns per ``fullName``.

    Args:
        date_folder: Date folder name, parsed by extract_info_from_folder.
        matchup_folder: Matchup folder name, parsed by extract_info_from_folder.
        position: Filename prefix selecting which CSVs to read
            (for example 'pitchers' or 'batters').

    Returns:
        DataFrame with one row per player: fullName, team, date, away_team,
        home_team, gamePk, the per-player means of the numeric columns,
        ``team_abbrev`` (away_team when team == "away", otherwise home_team)
        and ``starter`` (1 on the first row of each ``team`` value, else 0).

    Raises:
        ValueError: If the folder holds no matching CSV files.
    """
    date, away_team, home_team, gamePk = extract_info_from_folder(date_folder, matchup_folder)

    # Build the simulation folder path once instead of once per use
    sim_dir = os.path.join(baseball_path, "B02. Simulations", "2. Player Sims", date_folder, matchup_folder)

    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # (and therefore the order-dependent `starter` flag below) reproducible.
    csv_files = sorted(
        file for file in os.listdir(sim_dir)
        if file.startswith(position) and file.endswith('.csv')
    )
    if not csv_files:
        # pd.concat would fail anyway with an opaque "No objects to concatenate"
        raise ValueError(f"No '{position}' simulation CSVs found in {sim_dir}")

    # Read each simulation and tag it with the game identifiers
    dfs = []
    for csv_file in csv_files:
        df = pd.read_csv(os.path.join(sim_dir, csv_file))
        df['date'] = date
        df['away_team'] = away_team
        df['home_team'] = home_team
        df['gamePk'] = gamePk
        dfs.append(df)

    # Stack all simulations
    combined_df = pd.concat(dfs)

    # Per-player mean of every numeric column, in order of first appearance
    numeric_cols = combined_df.select_dtypes(include='number')
    averaged_numeric_cols = numeric_cols.groupby(combined_df['fullName'], sort=False).mean()

    # Non-averaged identifying columns, one row per distinct combination
    additional_cols = combined_df[['fullName', 'team', 'date', 'away_team', 'home_team', 'gamePk']].drop_duplicates().set_index('fullName')

    # Put identifiers and averages side by side, aligned on fullName
    averaged_df = pd.concat([additional_cols, averaged_numeric_cols], axis=1).reset_index()

    averaged_df['team_abbrev'] = np.where(averaged_df['team'] == "away", averaged_df['away_team'], averaged_df['home_team'])

    # First row seen for each side is flagged as the starter.
    # NOTE(review): this assumes the sim files list the starter first — confirm.
    averaged_df['starter'] = (~averaged_df['team'].duplicated()).astype(int)

    return averaged_df

### Player Average Projections

##### Batters

Calculate averages

In [None]:
%%time
position = 'batters'

# Parallelize the loop using joblib and directly return df_list
# One game_averages() task per entry found inside each date folder;
# the result is a list with one averaged DataFrame per matchup.
batter_df_list = Parallel(n_jobs=-1)(
    delayed(game_averages)(date_folder, matchup_folder, position) 
    for date_folder in date_folders 
    for matchup_folder in os.listdir(os.path.join(baseball_path, "B02. Simulations", "2. Player Sims", date_folder)))

Write to CSV

In [None]:
# Write each game's averaged batter projections to
# "C04. Players/Players <gamePk>/", one CSV per side.
for df in batter_df_list:
    # Positional lookup: all rows of a game's frame share one gamePk, and
    # .iloc[0] does not depend on what the index labels happen to be.
    gamePk = df['gamePk'].iloc[0]
    away_df = df.query('team == "away"')
    home_df = df.query('team == "home"')

    # Create folder (path built once and reused for both files)
    game_dir = os.path.join(baseball_path, "C04. Players", f"Players {gamePk}")
    os.makedirs(game_dir, exist_ok=True)

    # Write to csv
    away_df.to_csv(os.path.join(game_dir, f"away batters projections {gamePk}.csv"), index=False)
    home_df.to_csv(os.path.join(game_dir, f"home batters projections {gamePk}.csv"), index=False)

##### Pitchers

Calculate averages

In [None]:
%%time
position = 'pitchers'

# Parallelize the loop using joblib and directly return df_list
# One game_averages() task per entry found inside each date folder;
# the result is a list with one averaged DataFrame per matchup.
pitcher_df_list = Parallel(n_jobs=-1)(
    delayed(game_averages)(date_folder, matchup_folder, position) 
    for date_folder in date_folders 
    for matchup_folder in os.listdir(os.path.join(baseball_path, "B02. Simulations", "2. Player Sims", date_folder)))

Write to CSV

In [None]:
# Write each game's averaged pitcher projections to
# "C04. Players/Players <gamePk>/", one CSV per side.
for df in pitcher_df_list:
    # Positional lookup: all rows of a game's frame share one gamePk, and
    # .iloc[0] does not depend on what the index labels happen to be.
    gamePk = df['gamePk'].iloc[0]
    away_df = df.query('team == "away"')
    home_df = df.query('team == "home"')

    # Create folder (path built once and reused for both files)
    game_dir = os.path.join(baseball_path, "C04. Players", f"Players {gamePk}")
    os.makedirs(game_dir, exist_ok=True)

    # Write to csv
    away_df.to_csv(os.path.join(game_dir, f"away pitchers projections {gamePk}.csv"), index=False)
    home_df.to_csv(os.path.join(game_dir, f"home pitchers projections {gamePk}.csv"), index=False)

### Evaluate Projections

In [None]:
# Define a function to process each folder
def process_batters(folder):
    """Join projected and actual batter lines for one game.

    Args:
        folder: Folder name of the form "Players <gamePk>".

    Returns:
        DataFrame with the away rows followed by the home rows. Each side is
        an outer merge of the selected projection columns with that side's
        results file, matched on projection ``id`` == results ``personId``
        and on ``gamePk``.
    """
    # Second space-separated token of the folder name is the gamePk
    gamePk = folder.split(" ")[1]

    projection_cols = ['fullName', 'id', 'imp_b_l', 'imp_b_r', 'PA', 'HBP', 'BB', 'B1', 'B2', 'B3', 'HR', 'SB', 'R', 'RBI', 'FP', 'gamePk']
    projections_dir = os.path.join(baseball_path, "C04. Players", f"Players {gamePk}")
    results_dir = os.path.join(baseball_path, "A10. Player Results", f"Player Results {gamePk}")

    merged_sides = []
    for side in ("away", "home"):
        # Projections first, then actual results, exactly one file each
        projected = pd.read_csv(os.path.join(projections_dir, f"{side} batters projections {gamePk}.csv"))
        actual = pd.read_csv(os.path.join(results_dir, f"{side} batters {gamePk}.csv"))

        # Outer merge keeps players that appear on only one of the two sides
        merged_sides.append(
            projected[projection_cols].merge(actual, left_on=['id', 'gamePk'], right_on=['personId', 'gamePk'], how='outer')
        )

    # Away rows on top of home rows
    return pd.concat(merged_sides, axis=0)

In [None]:
def process_batters2(folder):
    """Best-effort wrapper around process_batters for the parallel loop.

    Args:
        folder: Folder name of the form "Players <gamePk>".

    Returns:
        The merged DataFrame, or None when the game could not be processed,
        so that one failing game does not abort the whole batch. A warning
        naming the folder and the error is emitted for every skipped game.
    """
    import warnings  # local import keeps this cell self-contained

    try:
        return process_batters(folder)
    except Exception as exc:
        # Exception only: KeyboardInterrupt / SystemExit must still stop the run.
        warnings.warn(f"Skipping {folder}: {type(exc).__name__}: {exc}")
        return None

In [None]:
%%time
# Run the loop in parallel
# process_batters2 returns None for folders it could not process.
batters_merged_list = Parallel(n_jobs=-1)(delayed(process_batters2)(folder) for folder in player_folders)
# pd.concat silently drops the None entries (it raises ValueError only if every entry is None).
batters_merged_df = pd.concat(batters_merged_list, axis=0)

In [None]:
def process_pitchers(folder):
    """Join projected and actual pitcher lines for one game.

    Args:
        folder: Folder name of the form "Players <gamePk>".

    Returns:
        DataFrame with the away rows followed by the home rows. Each side is
        an outer merge of the selected projection columns with that side's
        results file, matched on projection ``id`` == results ``personId``,
        on ``gamePk`` and on ``team``.
    """
    # Second space-separated token of the folder name is the gamePk
    gamePk = folder.split(" ")[1]

    projection_cols = ['fullName', 'id', 'imp_p_l', 'imp_p_r', 'OUT', 'PA', 'SO', 'HBP', 'BB', 'B1', 'B2', 'B3', 'HR', 'H', 'R', 'ER', 'W', 'CG', 'CGSO', 'NH', 'FP', 'team', 'gamePk']
    projections_dir = os.path.join(baseball_path, "C04. Players", f"Players {gamePk}")
    results_dir = os.path.join(baseball_path, "A10. Player Results", f"Player Results {gamePk}")

    merged_sides = []
    for side in ("away", "home"):
        projected = pd.read_csv(os.path.join(projections_dir, f"{side} pitchers projections {gamePk}.csv"))
        # Overwrite team with the side the file belongs to; it is a merge key below
        projected['team'] = side
        actual = pd.read_csv(os.path.join(results_dir, f"{side} pitchers {gamePk}.csv"))

        # Outer merge keeps pitchers that appear on only one of the two sides
        merged_sides.append(
            projected[projection_cols].merge(actual, left_on=['id', 'gamePk', 'team'], right_on=['personId', 'gamePk', 'team'], how='outer')
        )

    # Away rows on top of home rows
    return pd.concat(merged_sides, axis=0)

In [None]:
def process_pitchers2(folder):
    """Best-effort wrapper around process_pitchers for the parallel loop.

    Args:
        folder: Folder name of the form "Players <gamePk>".

    Returns:
        The merged DataFrame, or None when the game could not be processed,
        so that one failing game does not abort the whole batch. A warning
        naming the folder and the error is emitted for every skipped game.
    """
    import warnings  # local import keeps this cell self-contained

    try:
        return process_pitchers(folder)
    except Exception as exc:
        # Exception only: KeyboardInterrupt / SystemExit must still stop the run.
        warnings.warn(f"Skipping {folder}: {type(exc).__name__}: {exc}")
        return None

In [None]:
%%time
# Run the loop in parallel
# process_pitchers2 returns None for folders it could not process.
pitchers_merged_list = Parallel(n_jobs=-1)(delayed(process_pitchers2)(folder) for folder in player_folders)
# pd.concat silently drops the None entries (it raises ValueError only if every entry is None).
pitchers_merged_df = pd.concat(pitchers_merged_list, axis=0)

### Batters

##### Create New Variables

In [None]:
# Naming convention in this frame: UPPERCASE columns are projections, lowercase columns are actual results.
# Actual singles (a NaN in any operand propagates through the subtraction)
batters_merged_df['singles'] = batters_merged_df['h'] - batters_merged_df['doubles'] - batters_merged_df['triples'] - batters_merged_df['hr']
# Actual PA (row-wise sum skips NaN, so a row with no actuals yields 0 rather than NaN)
# NOTE(review): only ab + bb + hbp are counted; confirm whether sacrifices should be included.
batters_merged_df['pa'] = batters_merged_df[['ab', 'bb', 'hbp']].sum(axis=1)
# Projected hits
batters_merged_df['H'] = batters_merged_df[['B1', 'B2', 'B3', 'HR']].sum(axis=1)
# Reached (projected ON, then actual on)
batters_merged_df['ON'] = batters_merged_df[['H', 'BB', 'HBP']].sum(axis=1)
batters_merged_df['on'] = batters_merged_df[['h', 'bb', 'hbp']].sum(axis=1)

In [None]:
# Identify number of batters that batted in a given spot in the order (we may only want those who were never subbed out or are subs)
# Integer-divide battingOrder by 100 to get the lineup spot.
# NOTE(review): presumably battingOrder encodes spot*100 + substitution sequence — confirm.
batters_merged_df['battingSpot'] = batters_merged_df['battingOrder'] // 100
# Number of rows sharing the same game, team and lineup spot (1 means nobody else occupied that spot)
batters_merged_df['battersSpot'] = batters_merged_df.groupby(['gamePk', 'team', 'battingSpot'])['battingSpot'].transform('count')

##### Starters - Never Subbed

In [None]:
# Mean and sum of each stat for non-substitute batters who were the only player in their lineup spot.
# The column list alternates projected (UPPERCASE) and actual (lowercase) versions of each stat.
starting_batters = batters_merged_df.query('substitution == False').query('battersSpot == 1')[['PA', 'pa', 'ON', 'on', 'H', 'h', 'B1', 'singles', 'B2', 'doubles', 'B3', 'triples', 'HR', 'hr', 'BB', 'bb', 'SB', 'sb', 'FP', 'fp']].agg(['mean', 'sum'])

# Split the dataframe into projected (even rows) and actual (odd rows)
# This relies on the strict projected/actual alternation of the column list above.
projected = starting_batters.T.iloc[::2].reset_index()
actual = starting_batters.T.iloc[1::2].reset_index()

# Concatenate the two dataframes side-by-side
starting_batters = pd.concat([projected, actual], axis=1)

# Rename the columns (the first column of each half holds the stat name)
starting_batters.columns = ["Projected", "Projected Mean", 'Projected Sum', "Actual", "Actual Mean", 'Actual Sum']

starting_batters

##### Teams

In [None]:
# Sum every stat per (game, team), then take the mean and sum of those team totals.
# The column list alternates projected (UPPERCASE) and actual (lowercase) versions of each stat.
team_batters = batters_merged_df.groupby(['gamePk', 'team']).sum(numeric_only=True)[['PA', 'pa', 'ON', 'on', 'H', 'h', 'B1', 'singles', 'B2', 'doubles', 'B3', 'triples', 'HR', 'hr', 'BB', 'bb', 'HBP', 'hbp', 'R', 'r', 'RBI', 'rbi', 'SB', 'sb', 'FP', 'fp']].agg(['mean', 'sum'])

# Split the dataframe into projected (even rows) and actual (odd rows)
# This relies on the strict projected/actual alternation of the column list above.
projected = team_batters.T.iloc[::2].reset_index()
actual = team_batters.T.iloc[1::2].reset_index()

# Concatenate the two dataframes side-by-side
team_batters = pd.concat([projected, actual], axis=1)

# Rename the columns (the first column of each half holds the stat name)
team_batters.columns = ["Projected", "Projected Mean", 'Projected Sum', "Actual", "Actual Mean", 'Actual Sum']

team_batters

##### Scaled

In [None]:
# Average team-game plate appearances: actual (pa) and projected (PA).
# Their ratio is used as the scaling fraction in the next cell.
actual_pa_mean = batters_merged_df.groupby(['gamePk', 'team']).sum(numeric_only=True)['pa'].mean()
projected_pa_mean = batters_merged_df.groupby(['gamePk', 'team']).sum(numeric_only=True)['PA'].mean()

In [None]:
# Define the fraction: rescales projections so projected team PA matches actual team PA on average
fraction = actual_pa_mean/projected_pa_mean

# Select the columns you want to multiply and multiply them by the fraction.
# 'ON' is included because it is the sum of H, BB and HBP, which are all scaled;
# leaving it out would show an unscaled ON next to scaled components in the scaled table.
columns_to_multiply = ['PA', 'ON', 'H', 'B1', 'B2', 'B3', 'HR', 'BB', 'HBP', 'R', 'RBI', 'SB', 'FP']
# Work on a copy so the unscaled frame stays available to later cells
batters_merged_df_scaled = batters_merged_df.copy()
batters_merged_df_scaled[columns_to_multiply] = batters_merged_df[columns_to_multiply] * fraction

In [None]:
# Team-game totals computed from the PA-scaled projections.
# The column list alternates projected (UPPERCASE) and actual (lowercase) versions of each stat.
scaled_batters = batters_merged_df_scaled.groupby(['gamePk', 'team']).sum(numeric_only=True)[['PA', 'pa', 'ON', 'on', 'H', 'h', 'B1', 'singles', 'B2', 'doubles', 'B3', 'triples', 'HR', 'hr', 'BB', 'bb', 'HBP', 'hbp', 'R', 'r', 'RBI', 'rbi', 'SB', 'sb', 'FP', 'fp']]

# Convert to DF (one row per stat: stat name plus its mean over team-games)
scaled_batters = pd.DataFrame(scaled_batters.mean().reset_index())

# Split the dataframe into projected (even rows) and actual (odd rows)
projected = scaled_batters.iloc[::2].reset_index(drop=True)
actual = scaled_batters.iloc[1::2].reset_index(drop=True)

# Concatenate the two dataframes side-by-side
scaled_batters = pd.concat([projected, actual], axis=1)

# Rename the columns
scaled_batters.columns = ["Projected", "Projected Value", "Actual", "Actual Value"]

scaled_batters

In [None]:
# Consider tracking if players were never removed from game
# Consider merging on innings and only looking at full games

### Pitchers

##### Create New Variables

In [None]:
# Order rows by game, team and date; the forward-fill in the next cell depends on this row order.
pitchers_merged_df.sort_values(['gamePk', 'team', 'date'], inplace=True)

In [None]:
# Every fill below assigns the result back to the column. Calling
# fillna(..., inplace=True) on a column selection is a chained in-place write,
# which does not update the frame under pandas Copy-on-Write.

# Rows with a projection but no result: take the identity from the projection side
pitchers_merged_df['personId'] = pitchers_merged_df['personId'].fillna(pitchers_merged_df['id'])
pitchers_merged_df['name'] = pitchers_merged_df['name'].fillna(pitchers_merged_df['fullName'])

# Missing actual stats are treated as zero
for col in ['starter', 'ip', 'outs', 'h', 'r', 'er', 'bb', 'k', 'hr', 'hbp', 'w', 'l', 'cg', 'cgso', 'nh', 'fp']:
    pitchers_merged_df[col] = pitchers_merged_df[col].fillna(0)

# Game-level fields are carried forward from the previous row (rows were sorted in the cell above)
for col in ['date', 'year', 'venue_id', 'team', 'teamabbrev']:
    # pitchers_merged_df.sort_values(['date', 'year', 'venue_id', 'team', 'teamabbrev'], ascending=False, inplace=True)
    # .ffill() replaces the deprecated fillna(method='ffill')
    pitchers_merged_df[col] = pitchers_merged_df[col].ffill()

##### Starters

In [None]:
# Mean and sum of each stat over starters, using only rows with no missing value in any column.
# The column list alternates projected (UPPERCASE) and actual (lowercase) versions of each stat.
starting_pitchers = pitchers_merged_df.dropna().query('starter == 1')[['OUT', 'outs', 'PA', 'pa', 'ER', 'er', 'R', 'r', 'H', 'h', 'SO', 'k', 'W', 'w', 'FP', 'fp']].agg(['mean', 'sum'])

# Split the dataframe into projected (even rows) and actual (odd rows)
# This relies on the strict projected/actual alternation of the column list above.
projected = starting_pitchers.T.iloc[::2].reset_index()
actual = starting_pitchers.T.iloc[1::2].reset_index()

# Concatenate the two dataframes side-by-side
starting_pitchers = pd.concat([projected, actual], axis=1)

# Rename the columns (the first column of each half holds the stat name)
starting_pitchers.columns = ["Projected", "Projected Mean", 'Projected Sum', "Actual", "Actual Mean", 'Actual Sum']

starting_pitchers

##### Scaled

This calculates how starting pitchers would do if they went as long as they were supposed to. <br>
Note: outs should have a nonlinear relationship with wins, so this won't be exactly right, but close enough. 

In [None]:
# Average outs per starter, actual (outs) and projected (OUT), over rows with no missing values.
# Their ratio is used as the scaling fraction in the next cell.
actual_outs_mean = pitchers_merged_df.dropna().query('starter == 1')['outs'].mean()
projected_outs_mean = pitchers_merged_df.dropna().query('starter == 1')['OUT'].mean()

In [None]:
# Define the fraction: rescales projections so projected starter outs match actual starter outs on average
fraction = actual_outs_mean/projected_outs_mean

# Select the columns you want to multiply and multiply them by the fraction
# NOTE(review): PA and W are reported in the scaled table below but are not scaled here — confirm this is intended.
columns_to_multiply = ['OUT', 'ER', 'R', 'SO', 'FP']
# Work on a copy so the unscaled frame stays available to later cells
pitchers_merged_df_scaled = pitchers_merged_df.copy()
pitchers_merged_df_scaled[columns_to_multiply] = pitchers_merged_df_scaled[columns_to_multiply] * fraction

In [None]:
# Mean and sum of each stat over starters, computed from the outs-scaled projections.
# The column list alternates projected (UPPERCASE) and actual (lowercase) versions of each stat.
pitchers_scaled = pitchers_merged_df_scaled.dropna().query('starter == 1')[['OUT', 'outs', 'PA', 'pa', 'ER', 'er', 'R', 'r', 'SO', 'k', 'W', 'w', 'FP', 'fp']].agg(['mean', 'sum'])

# Split the dataframe into projected (even rows) and actual (odd rows)
projected = pitchers_scaled.T.iloc[::2].reset_index()
actual = pitchers_scaled.T.iloc[1::2].reset_index()

# Concatenate the two dataframes side-by-side
pitchers_scaled = pd.concat([projected, actual], axis=1)

# Rename the columns (the first column of each half holds the stat name)
pitchers_scaled.columns = ["Projected", "Projected Mean", 'Projected Sum', "Actual", "Actual Mean", 'Actual Sum']

pitchers_scaled

### Teams

In [None]:

# Team-game pitching totals from the UNSCALED frame, restricted to team-games with at least 24 actual outs.
# NOTE(review): despite the variable name, no scaling is applied in this cell.
scaled_pitchers = pitchers_merged_df.groupby(['gamePk', 'team']).sum(numeric_only=True).query('outs >= 24')[['OUT', 'outs', 'PA', 'pa', 'ER', 'er', 'R', 'r', 'SO', 'k', 'H', 'h', 'BB', 'bb', 'HR', 'hr', 'FP', 'fp']].agg(['mean'])

# Convert to DF
# The frame above already has a single 'mean' row, so this second .mean() leaves the values unchanged.
scaled_pitchers = pd.DataFrame(scaled_pitchers.mean().reset_index())

# Split the dataframe into projected (even rows) and actual (odd rows)
projected = scaled_pitchers.iloc[::2].reset_index(drop=True)
actual = scaled_pitchers.iloc[1::2].reset_index(drop=True)

# Concatenate the two dataframes side-by-side
scaled_pitchers = pd.concat([projected, actual], axis=1)

# Rename the columns
scaled_pitchers.columns = ["Projected", "Projected Value", "Actual", "Actual Value"]

scaled_pitchers

### Read in projections

##### DFF - Date-Based

In [None]:
# Set the directory path (date-based DFF projection files)
# NOTE(review): hard-coded absolute path; other cells build paths from baseball_path.
directory = r'C:\Users\james\Documents\MLB\Database\A07. Projections\1. DFF\2. Projections\Date'

# Initialize an empty list to store dataframes
dfs = []

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):  # Check if the file is a CSV file
        # Read the CSV file into a pandas dataframe
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        dfs.append(df)  # Append the dataframe to the list

# Concatenate all dataframes into a single dataframe
dff_date_df = pd.concat(dfs, ignore_index=True)

# Drop duplicates based on "first_name", "last_name", and "game_date" (the first occurrence is kept)
dff_date_df.drop_duplicates(subset=["first_name", "last_name", "game_date"], inplace=True)

# Print the shape (rows, columns) of the resulting dataframe
print(dff_date_df.shape)


##### DFF - Slate-Based

In [None]:
# Set the directory path (slate-based DFF projection files)
# NOTE(review): hard-coded absolute path; other cells build paths from baseball_path.
directory = r'C:\Users\james\Documents\MLB\Database\A07. Projections\1. DFF\2. Projections'

# Initialize an empty list to store dataframes
dfs = []

# Loop through all files in the directory
# Only names ending in .csv are read, so sub-folders such as "Date" are skipped.
for filename in os.listdir(directory):
    if filename.endswith('.csv'):  # Check if the file is a CSV file
        # Read the CSV file into a pandas dataframe
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        dfs.append(df)  # Append the dataframe to the list

# Concatenate all dataframes into a single dataframe
dff_slate_df = pd.concat(dfs, ignore_index=True)

# Drop duplicates based on "First Name", "Last Name", and "date" (the first occurrence is kept)
dff_slate_df.drop_duplicates(subset=["First Name", "Last Name", "date"], inplace=True)

# Print the shape (rows, columns) of the resulting dataframe
print(dff_slate_df.shape)


In [None]:
# Give the date-based frame the same column names the slate-based frame uses.
dff_date_df.rename(columns={'first_name':'First Name', 'last_name':'Last Name', 'ppg_projection':'FP', 'team':'Team'}, inplace=True)
# Strip the dashes from game_date and store it as an integer in `date`.
# NOTE(review): presumably "YYYY-MM-DD" -> YYYYMMDD — confirm against the files.
dff_date_df['date'] = dff_date_df['game_date'].str.replace("-", "").astype('int')

In [None]:
# Stack both DFF sources; date-based rows come first, so they win when duplicates are dropped later.
dff_df = pd.concat([dff_date_df[['First Name', 'Last Name', 'Team', 'FP', 'date']], dff_slate_df[['First Name', 'Last Name', 'FP', 'Team', 'date']]], axis=0)

In [None]:
# Build the join key used against my projections, and rename DFF's FP so it cannot clash with my FP column.
dff_df['fullName'] = dff_df['First Name'] + " " + dff_df['Last Name']
dff_df.rename(columns={'FP': 'FP_DFF'}, inplace=True)

In [None]:
# One DFF projection per player name and date (the first occurrence is kept).
dff_df.drop_duplicates(['fullName', 'date'], inplace=True)

##### Batters

In [None]:
# Keep only batters present in both sources; validate='one_to_one' makes the merge raise if a (fullName, date) key repeats on either side.
batters_with_dff = batters_merged_df.drop_duplicates(['fullName', 'date']).merge(dff_df, on=['fullName', 'date'], how='inner', validate='one_to_one')

In [None]:
# Signed errors: actual fp minus projected FP (mine, then DFF's)
batters_with_dff['error_me'] = (batters_with_dff['fp'] - batters_with_dff['FP'])
batters_with_dff['error_dff'] = (batters_with_dff['fp'] - batters_with_dff['FP_DFF'])

# Squared errors
batters_with_dff['error_me2'] = batters_with_dff['error_me'] ** 2
batters_with_dff['error_dff2'] = batters_with_dff['error_dff'] ** 2

# 1 when my squared error is strictly smaller than DFF's, else 0.
# NOTE(review): comparisons with NaN are False, so rows missing fp or a projection are counted as 0.
batters_with_dff['beat_dff'] = (batters_with_dff['error_me2'] < batters_with_dff['error_dff2']).astype('int')

##### All

In [None]:
# Summary statistics, restricted to batters both sources project above 5 FP.
batters_with_dff.query('FP > 5 and FP_DFF > 5')[['FP', 'FP_DFF', 'fp', 'error_me', 'error_dff', 'error_me2', 'error_dff2', 'beat_dff']].describe()

##### Winsorized

In [None]:
# Winsorize the DataFrame column by column, same FP > 5 filter as above.
# NOTE(review): winsorize is not defined in this notebook (presumably scipy.stats.mstats.winsorize,
# with limits=[0.05, 0.05] clipping the lowest and highest 5% of each column) — confirm.
batters_winsorized = batters_with_dff.query('FP > 5 and FP_DFF > 5')[['FP', 'FP_DFF', 'fp', 'error_me', 'error_dff', 'error_me2', 'error_dff2', 'beat_dff']].apply(lambda x: winsorize(x, limits=[0.05, 0.05]))

# Describe the winsorized DataFrame
batters_winsorized.describe()

##### Pitchers

In [None]:
# Keep only pitchers present in both sources (unlike the batter merge, no validate= check is applied here).
pitchers_with_dff = pitchers_merged_df.drop_duplicates(['fullName', 'date']).merge(dff_df, on=['fullName', 'date'], how='inner')

In [None]:
# Signed errors: actual fp minus projected FP (mine, then DFF's)
pitchers_with_dff['error_me'] = (pitchers_with_dff['fp'] - pitchers_with_dff['FP'])
pitchers_with_dff['error_dff'] = (pitchers_with_dff['fp'] - pitchers_with_dff['FP_DFF'])

# Squared errors
pitchers_with_dff['error_me2'] = pitchers_with_dff['error_me'] ** 2
pitchers_with_dff['error_dff2'] = pitchers_with_dff['error_dff'] ** 2

# 1 when my squared error is strictly smaller than DFF's, else 0.
# NOTE(review): comparisons with NaN are False, so rows missing a projection are counted as 0.
pitchers_with_dff['beat_dff'] = (pitchers_with_dff['error_me2'] < pitchers_with_dff['error_dff2']).astype('int')

##### All

In [None]:
# Summary statistics for starters, using only rows with no missing value in any column.
pitchers_with_dff.dropna().query('starter == 1')[['FP', 'FP_DFF', 'fp', 'error_me', 'error_dff', 'error_me2', 'error_dff2', 'beat_dff']].describe()

In [None]:
# Same summary, further restricted to starters I project above 10 FP.
pitchers_with_dff.query('FP > 10').dropna().query('starter == 1')[['FP', 'FP_DFF', 'fp', 'error_me', 'error_dff', 'error_me2', 'error_dff2', 'beat_dff']].describe()

In [None]:
import numpy as np

# Starters with no missing value in any column
df = pitchers_with_dff.query('starter == 1').dropna()

# Bucketing the data based on intervals of bucket_size FP (each value is floored to its bucket's lower edge)
bucket_size = 2
FP_bucketed = np.floor(df['FP'] / bucket_size) * bucket_size

# Calculating the average fp for each bucket
grouped_data = df.groupby(FP_bucketed)['fp'].mean()

# Getting the center of each bucket (lower edge plus half a bucket)
bucket_centers = (grouped_data.index + bucket_size / 2)

# Creating scatter plot
plt.figure(figsize=(8, 8))
plt.scatter(bucket_centers, grouped_data, color='blue')
plt.title('Average fp vs FP (Bucketed)')
plt.xlabel('FP')
plt.ylabel('Average fp')
plt.grid(True)

# Use the same -10 to 60 range on both axes so the plot is square in data units
plt.xlim(-10, 60)
plt.ylim(-10, 60)

plt.show()


##### Winsorized

In [None]:
# Winsorize the DataFrame column by column, starters only.
# NOTE(review): unlike the describe cells above, no dropna() is applied before winsorizing — confirm this is intended.
pitchers_winsorized = pitchers_with_dff.query('starter == 1')[['FP', 'FP_DFF', 'fp', 'error_me', 'error_dff', 'error_me2', 'error_dff2', 'beat_dff']].apply(lambda x: winsorize(x, limits=[0.05, 0.05]))

# Describe the winsorized DataFrame
pitchers_winsorized.describe()

In [None]:
# NOTE(review): BRERASLK is not defined in the visible part of this notebook, so this cell raises NameError.
# Presumably a deliberate tripwire that stops "Run All" before the scratch cells below — confirm,
# and consider replacing it with an explicit raise that carries a message.
BRERASLK 

In [None]:
## Testing

In [None]:
# Batters that have both a projection (FP) and an actual score (fp).
# Rows with a missing fp are dropped: NaN comparisons are False, so they would
# otherwise be counted as "not over-projected" and bias the rates below.
# .copy() makes `test` an independent frame so the column added next is not set on a slice.
test = batters_merged_df.dropna(subset=['FP', 'fp'])[['fullName', 'imp_b_l', 'imp_b_r', 'FP', 'fp']].copy()
# 1 when the projection was strictly above the actual score
test['overproject'] = (test['FP'] > test['fp']).astype(int)
test.describe()

In [None]:
# Share of over-projected batters for each value of imp_b_l.
# NOTE(review): imp_b_l presumably flags imputed vs-left batter inputs — confirm.
test.groupby('imp_b_l')['overproject'].mean()

In [None]:
# Share of over-projected batters for each value of imp_b_r.
# NOTE(review): imp_b_r presumably flags imputed vs-right batter inputs — confirm.
test.groupby('imp_b_r')['overproject'].mean()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Calibration plot: average actual fp within each bucket of projected FP.
# A well-calibrated projection puts the points on the 45-degree line.

# Bucket width in FP. Kept in one place so the bucketing and the title cannot drift apart.
bucket_width = 0.2

# Floor each projection to the lower edge of its bucket
test['FP_bucket'] = np.floor(test['FP'] / bucket_width) * bucket_width

# Group by the buckets and calculate the mean of fp for each bucket
grouped = test.groupby('FP_bucket')['fp'].mean().reset_index()

# Scatter plot
plt.figure(figsize=(8, 8))  # Make the plot square
plt.scatter(grouped['FP_bucket'], grouped['fp'], label='Data points')
plt.xlabel('FP (bucketed)')
plt.ylabel('Average fp')
plt.title(f'Average fp for each {bucket_width} FP bucket')

# Set the aspect ratio to be equal
plt.gca().set_aspect('equal', adjustable='box')

# Ensure the limits of the axes are the same
min_val = min(grouped['FP_bucket'].min(), grouped['fp'].min())
max_val = max(grouped['FP_bucket'].max(), grouped['fp'].max())
plt.xlim(min_val, max_val)
plt.ylim(min_val, max_val)

# Plot the 45-degree line
plt.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', label='45-degree line')

plt.legend()
plt.show()


In [None]:
import pandas as pd
import numpy as np
# linregress is no longer used here; the import is kept in case a later cell relies on it
from scipy.stats import linregress, t as student_t

# Regression through the origin: fp = slope * FP + error.
# Every reported statistic comes from this same no-intercept model. Mixing in
# R-squared, p-value and standard error from a with-intercept fit would describe a different model.

# Drop rows missing either value so every sum below runs over the same observations
reg_df = test[['FP', 'fp']].dropna()
x = reg_df['FP'].to_numpy(dtype=float)
y = reg_df['fp'].to_numpy(dtype=float)

# Residual degrees of freedom: one estimated parameter (the slope), no intercept
dof = len(x) - 1
sum_xx = np.sum(x ** 2)
if dof < 1 or sum_xx == 0:
    raise ValueError("Need at least two complete (FP, fp) rows with a non-zero FP to fit the regression")

# Least-squares slope with the intercept fixed at 0
slope = np.sum(x * y) / sum_xx
sse = np.sum((y - slope * x) ** 2)

# Uncentered R-squared, the usual definition for a model without an intercept
r_squared = 1 - sse / np.sum(y ** 2)

# Standard error of the slope and two-sided p-value for H0: slope == 0
std_err = np.sqrt(sse / dof / sum_xx)
p_value = 2 * student_t.sf(abs(slope / std_err), dof)

# Print the summary
print("OLS Regression Summary (with intercept = 0):")
print(f"Slope: {slope:.4f}")
print(f"R-squared: {r_squared:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Standard Error: {std_err:.4f}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Uses the `test` frame built above (columns 'FP' and 'overproject').

# Create buckets of width 1 for FP (each value is floored to its bucket's lower edge)
# This overwrites the FP_bucket column created by the earlier plot cell.
test['FP_bucket'] = np.floor(test['FP'] / 1) * 1

# Group by the buckets and calculate the mean of overproject for each bucket
grouped = test.groupby('FP_bucket')['overproject'].mean().reset_index()

# Scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(grouped['FP_bucket'], grouped['overproject'])
plt.xlabel('FP (bucketed)')
plt.ylabel('Average overproject')
plt.title('Average overproject as FP increases')

plt.grid(True)
plt.show()


In [None]:
# Pitchers with a projection above 8 FP; missing actual fp values were filled with 0 in the pitcher fill cell.
pitcher_test = pitchers_merged_df[~pitchers_merged_df['FP'].isna()].query('FP > 8')[['fullName', 'imp_p_l', 'imp_p_r', 'FP', 'fp']]
# 1 when the projection was strictly above the actual score
pitcher_test['overproject'] = (pitcher_test['FP'] > pitcher_test['fp']).astype(int)
pitcher_test.describe()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Uses the `pitcher_test` frame built above (columns 'FP' and 'overproject').

# Create buckets of width 2 for FP (each value is floored to its bucket's lower edge)
pitcher_test['FP_bucket'] = np.floor(pitcher_test['FP'] / 2) * 2

# Group by the buckets and calculate the mean of overproject for each bucket
grouped = pitcher_test.groupby('FP_bucket')['overproject'].mean().reset_index()

# Scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(grouped['FP_bucket'], grouped['overproject'])
plt.xlabel('FP (bucketed)')
plt.ylabel('Average overproject')
plt.title('Average overproject as FP increases')

plt.grid(True)
plt.show()
