In [60]:
import importlib
import myutils
importlib.reload(myutils)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd

In [49]:
df = pd.read_csv("csv_files/fangraphs_leaderboard_24.csv")
df["vFA (pi)"].fillna(df["vFA (pi)"].mean(), inplace=True)
print(df["vFA (pi)"].median())

93.88337371552967


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["vFA (pi)"].fillna(df["vFA (pi)"].mean(), inplace=True)


In [25]:
df2 = pd.read_csv("fangraphs-leaderboards 2008.csv")
df2["vFA (pi)"].fillna(df2["vFA (pi)"].mean(), inplace=True)
print(df2["vFA (pi)"].median())

90.63017002854056


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2["vFA (pi)"].fillna(df2["vFA (pi)"].mean(), inplace=True)


In [26]:
df3 = pd.read_csv("2008-velo.csv")
df3["FB%"].fillna(df3["FB%"].mean(), inplace=True)
print(df3["FB%"].median())

0.596683725


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df3["FB%"].fillna(df3["FB%"].mean(), inplace=True)


In [30]:
df4 = pd.read_csv("csv_files/fangraphs_leaderboard_24.csv")
df4["FBv"].fillna(df4["FBv"].mean(), inplace=True)
print(df4["FBv"].median())

93.72347873999999


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df4["FBv"].fillna(df4["FBv"].mean(), inplace=True)


In [31]:
# get rid of same named columns 
df_clean = df[["Name"]]
df = df.drop(columns=['Team', 'NameASCII', 'PlayerId', 'MLBAMID'], errors="ignore")
df4["Name"] = df4["Name"].astype(str)
df["Name"] = df["Name"].astype(str)
new_df = df4.join(df_clean.set_index("Name"), on="Name", how='left')
print(new_df)
len(df4)

                  Name   Team              Name.1 Team.1  Season  Age   W   L  \
0           Aaron Nola    PHI          Aaron Nola    PHI    2024   31  14   8   
1        Austin Gomber    COL       Austin Gomber    COL    2024   30   5  12   
2          Bailey Ober    MIN         Bailey Ober    MIN    2024   28  12   9   
3         Brady Singer    KCR        Brady Singer    KCR    2024   27   9  13   
4       Brandon Pfaadt    ARI      Brandon Pfaadt    ARI    2024   25  11  10   
5         Brayan Bello    BOS        Brayan Bello    BOS    2024   25  14   8   
6         Bryce Miller    SEA        Bryce Miller    SEA    2024   25  12   8   
7         Carlos Rodón    NYY        Carlos Rodón    NYY    2024   31  16   9   
8       Charlie Morton    ATL      Charlie Morton    ATL    2024   40   8  10   
9        Chris Bassitt    TOR       Chris Bassitt    TOR    2024   35  10  14   
10          Chris Sale    ATL          Chris Sale    ATL    2024   35  18   3   
11         Cole Ragans    KC

58

In [65]:
injury_page = pd.read_csv("csv_files/injury_report_24.csv")
pitch_breakdown = pd.read_csv("csv_files/fangraphs_leaderboard_all_pitchers.csv")

counter = len(injury_page[injury_page["Pos"] == "SP"])
print(counter)

229


855 players pitched during the 2024 season. 229 starting pitchers were listed as injured during the 2024 season. 199 of them played in a game. 

In [45]:
injured_pitchers = injury_page[injury_page["Pos"] == "SP"]
injured_pitchers_with_age = pd.merge(injured_pitchers, pitch_breakdown, on="Name", how="inner")

Here is the breakdown by age:

In [38]:
age_counts = injured_pitchers_with_age["Age"].value_counts().sort_index()  # Sort by age
print(age_counts)

Age
21     1
22     4
23     6
24    15
25    19
26    30
27    12
28    17
29    16
30    12
31    22
32    13
33     6
34     5
35     2
36     7
37     8
39     3
41     2
Name: count, dtype: int64


In [47]:
injured_pitchers_with_age["vFA (pi)"].fillna(injured_pitchers_with_age["vFA (pi)"].mean(), inplace=True)
print(injured_pitchers_with_age["vFA (pi)"].median())

print(injured_pitchers_with_age["IP"].median()/injured_pitchers_with_age["G"].median())

93.79889189889607
4.5058823529411764


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  injured_pitchers_with_age["vFA (pi)"].fillna(injured_pitchers_with_age["vFA (pi)"].mean(), inplace=True)


In [None]:
merged = pd.merge(
    pitch_breakdown,
    injured_pitchers,
    on="Name",        
    how="left",       # keep all pitchers
    indicator=True    # adds a "_merge" column
)

# add classification of 1 if injured, 0 if not
pitch_breakdown["injury_label"] = merged["_merge"].apply(lambda x: 1 if x == "both" else 0)


855


In [69]:
features = [
    'Age',
    'IP',
    'vFA (pi)',
    'FB%',
    'SL%',
    'CH%',
]

X = pitch_breakdown[features]
y = pitch_breakdown["injury_label"] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('model', LogisticRegression(max_iter=1000))  # use more iterations if it doesn't converge
])

# Fit the pipeline
pipeline.fit(X_train, y_train)
