In [None]:
import pandas as pd
import pyarrow.dataset as pads
from SMT_data_starter import readDataSubset

## Getting season metrics for all pitchers

In [None]:
# Open the game_info dataset once; each level below is a filtered read of it.
game_info_subset = readDataSubset('game_info', '2024_SMT_Data_Challenge')


def _load_game_info(home_team):
    """Read Season_1884 'top' rows for ``home_team`` whose pitcher id is below 1000.

    Rows whose pitcher is missing are dropped after conversion to pandas.
    """
    keep_rows = (
        (pads.field('Season') == 'Season_1884')
        & (pads.field('home_team') == home_team)
        & (pads.field('top_bottom') == 'top')
        & (pads.field('pitcher') < 1000)
    )
    return game_info_subset.to_table(filter=keep_rows).to_pandas().dropna(subset=['pitcher'])


game_info_1A = _load_game_info('Home1A')
game_info_2A = _load_game_info('Home2A')
game_info_3A = _load_game_info('Home3A')
game_info_4A = _load_game_info('Home4A')

In [None]:
# Open the game_events dataset once and materialise one DataFrame per home-team level.
# NOTE(review): the game_events_* frames are not referenced again in the visible cells —
# confirm they are still needed before keeping these reads.
game_events_subset = readDataSubset('game_events', '2024_SMT_Data_Challenge')


def _load_game_events(home_team):
    """Read every Season_1884 game_events row for ``home_team`` into pandas."""
    keep_rows = (
        (pads.field('Season') == 'Season_1884')
        & (pads.field('HomeTeam') == home_team)
    )
    return game_events_subset.to_table(filter=keep_rows).to_pandas()


game_events_1A = _load_game_events('Home1A')
game_events_2A = _load_game_events('Home2A')
game_events_3A = _load_game_events('Home3A')
game_events_4A = _load_game_events('Home4A')

In [None]:
from functions import calculate_pace, calculate_rest_days, get_pitching_metrics, create_pitching_metrics_df, create_cumulative_pitching_metrics_df, get_season_metrics_df, apply_fatigue_unit_equation, apply_muscle_fatigue_equation

In [None]:
# Distinct pitcher ids found in each level's game_info frame, in order of first appearance.
pitchers_list_1A, pitchers_list_2A, pitchers_list_3A, pitchers_list_4A = (
    _level_info['pitcher'].unique().tolist()
    for _level_info in (game_info_1A, game_info_2A, game_info_3A, game_info_4A)
)

In [None]:
# get_pitching_metrics yields six parallel sequences per level; unpack them by level.
(hits_list_1A, walks_list_1A, strikeout_list_1A,
 batters_faced_list_1A, innings_pitched_list_1A,
 is_starter_list_1A) = get_pitching_metrics(game_info_1A, '1A')

(hits_list_2A, walks_list_2A, strikeout_list_2A,
 batters_faced_list_2A, innings_pitched_list_2A,
 is_starter_list_2A) = get_pitching_metrics(game_info_2A, '2A')

(hits_list_3A, walks_list_3A, strikeout_list_3A,
 batters_faced_list_3A, innings_pitched_list_3A,
 is_starter_list_3A) = get_pitching_metrics(game_info_3A, '3A')

(hits_list_4A, walks_list_4A, strikeout_list_4A,
 batters_faced_list_4A, innings_pitched_list_4A,
 is_starter_list_4A) = get_pitching_metrics(game_info_4A, '4A')

In [None]:
# Assemble one per-level metrics frame from the game_info rows and the six metric lists.
pitching_metrics_1A = create_pitching_metrics_df(
    game_info_1A,
    hits_list_1A, walks_list_1A, strikeout_list_1A,
    batters_faced_list_1A, innings_pitched_list_1A, is_starter_list_1A,
)
pitching_metrics_2A = create_pitching_metrics_df(
    game_info_2A,
    hits_list_2A, walks_list_2A, strikeout_list_2A,
    batters_faced_list_2A, innings_pitched_list_2A, is_starter_list_2A,
)
pitching_metrics_3A = create_pitching_metrics_df(
    game_info_3A,
    hits_list_3A, walks_list_3A, strikeout_list_3A,
    batters_faced_list_3A, innings_pitched_list_3A, is_starter_list_3A,
)
pitching_metrics_4A = create_pitching_metrics_df(
    game_info_4A,
    hits_list_4A, walks_list_4A, strikeout_list_4A,
    batters_faced_list_4A, innings_pitched_list_4A, is_starter_list_4A,
)

In [None]:
# calculate_pace returns a (total pitches, pace) pair for each level; levels are
# processed in 1A → 4A order, exactly as before.
(
    (total_pitches_list_1A, pace_list_1A),
    (total_pitches_list_2A, pace_list_2A),
    (total_pitches_list_3A, pace_list_3A),
    (total_pitches_list_4A, pace_list_4A),
) = (
    calculate_pace(_level_info, _level_name)
    for _level_info, _level_name in (
        (game_info_1A, '1A'),
        (game_info_2A, '2A'),
        (game_info_3A, '3A'),
        (game_info_4A, '4A'),
    )
)

In [None]:
# Attach the pitch totals and pace values to each level's metrics frame (in place).
for _metrics, _pitch_totals, _paces in (
    (pitching_metrics_1A, total_pitches_list_1A, pace_list_1A),
    (pitching_metrics_2A, total_pitches_list_2A, pace_list_2A),
    (pitching_metrics_3A, total_pitches_list_3A, pace_list_3A),
    (pitching_metrics_4A, total_pitches_list_4A, pace_list_4A),
):
    _metrics['total_pitches'] = _pitch_totals
    _metrics['pace'] = _paces

In [None]:
# Add a muscle_fatigue column to each level's metrics frame, computed from that frame.
for _metrics in (
    pitching_metrics_1A,
    pitching_metrics_2A,
    pitching_metrics_3A,
    pitching_metrics_4A,
):
    _metrics['muscle_fatigue'] = apply_muscle_fatigue_equation(_metrics)

In [None]:
# Build the cumulative frame for each level and discard its 'pace' column.
(
    cumulative_metrics_1A,
    cumulative_metrics_2A,
    cumulative_metrics_3A,
    cumulative_metrics_4A,
) = (
    create_cumulative_pitching_metrics_df(_metrics).drop(columns=['pace'])
    for _metrics in (
        pitching_metrics_1A,
        pitching_metrics_2A,
        pitching_metrics_3A,
        pitching_metrics_4A,
    )
)

In [None]:
# Step 1: fatigue units, derived from each level's cumulative frame.
for _cumulative in (
    cumulative_metrics_1A,
    cumulative_metrics_2A,
    cumulative_metrics_3A,
    cumulative_metrics_4A,
):
    _cumulative['fatigue_units'] = apply_fatigue_unit_equation(_cumulative)

# Step 2: running total of muscle_fatigue within each pitcher, as a plain list per level.
(
    muscle_fatigue_1A_cumsum,
    muscle_fatigue_2A_cumsum,
    muscle_fatigue_3A_cumsum,
    muscle_fatigue_4A_cumsum,
) = (
    _per_game.groupby('pitcher').agg({'muscle_fatigue': 'cumsum'})['muscle_fatigue'].tolist()
    for _per_game in (
        pitching_metrics_1A,
        pitching_metrics_2A,
        pitching_metrics_3A,
        pitching_metrics_4A,
    )
)

# Step 3: running average = running total / games played so far.
# The division pairs list entries with rows by position.
# NOTE(review): this assumes each cumulative frame has the same row order as its
# pitching_metrics frame — confirm against create_cumulative_pitching_metrics_df.
for _cumulative, _running_total in (
    (cumulative_metrics_1A, muscle_fatigue_1A_cumsum),
    (cumulative_metrics_2A, muscle_fatigue_2A_cumsum),
    (cumulative_metrics_3A, muscle_fatigue_3A_cumsum),
    (cumulative_metrics_4A, muscle_fatigue_4A_cumsum),
):
    _cumulative['average_muscle_fatigue'] = _running_total / (_cumulative['games_played'])

In [None]:
# Reorder each cumulative frame so that a pitcher's rows are contiguous, keeping the
# original row order within each pitcher, then renumber the index 0..n-1.
#
# This replaces `groupby('pitcher').apply(pd.DataFrame)`, whose result depends on the
# pandas version: older releases return like-indexed results in the original order,
# pandas 2.x returns them grouped by key (with a DeprecationWarning in 2.2 for operating
# on the grouping column), and once grouping columns are excluded from `apply` the
# 'pitcher' column would be lost. A stable sort states the intended ordering directly.
# Rows with a missing pitcher are dropped, as groupby does by default.
(
    cumulative_metrics_1A,
    cumulative_metrics_2A,
    cumulative_metrics_3A,
    cumulative_metrics_4A,
) = (
    _cumulative.dropna(subset=['pitcher'])
    .sort_values('pitcher', kind='stable')
    .reset_index(drop=True)
    for _cumulative in (
        cumulative_metrics_1A,
        cumulative_metrics_2A,
        cumulative_metrics_3A,
        cumulative_metrics_4A,
    )
)

In [None]:
# Add a rest_days column to each level's cumulative frame, computed from that frame.
for _cumulative in (
    cumulative_metrics_1A,
    cumulative_metrics_2A,
    cumulative_metrics_3A,
    cumulative_metrics_4A,
):
    _cumulative['rest_days'] = calculate_rest_days(_cumulative)

In [None]:
# Stack the per-level season summaries into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; previously the
# reset_index(drop=True) result was only displayed and never assigned, so season_df
# kept the repeated per-level index labels produced by concat.
season_df = (
    pd.concat(
        [
            get_season_metrics_df(cumulative_metrics_1A),
            get_season_metrics_df(cumulative_metrics_2A),
            get_season_metrics_df(cumulative_metrics_3A),
            get_season_metrics_df(cumulative_metrics_4A),
        ],
        ignore_index=True,
    )
    .drop(columns=['rest_days'])
    .rename(columns={'game_day': 'last_game_day_played'})
)
# Show the combined frame as the cell output.
season_df

In [None]:
# Cast the numeric summary columns to float in one go.
_float_columns = [
    'average_rest_days',
    'fatigue_units',
    'average_muscle_fatigue',
    'WHIP',
    'K/IP',
    'innings_pitched',
    'total_pitches',
    'games_played',
]
season_df[_float_columns] = season_df[_float_columns].astype(float)

In [None]:
# Persist the season-level summary to the working directory, without the index column.
season_df.to_csv('season_metrics.csv', index=False)

## Getting metrics for pitchers who played in extra-innings

In [None]:
def _load_extra_inning_game_info(home_team):
    """Read Season_1884 'top' rows for ``home_team`` with inning > 9 and pitcher id < 1000.

    Reads from the already-opened ``game_info_subset``; rows whose pitcher is missing
    are dropped after conversion to pandas.
    """
    keep_rows = (
        (pads.field('Season') == 'Season_1884')
        & (pads.field('home_team') == home_team)
        & (pads.field('top_bottom') == 'top')
        & (pads.field('pitcher') < 1000)
        & (pads.field('inning') > 9)
    )
    return game_info_subset.to_table(filter=keep_rows).to_pandas().dropna(subset=['pitcher'])


game_info_1A_extra = _load_extra_inning_game_info('Home1A')
game_info_2A_extra = _load_extra_inning_game_info('Home2A')
game_info_3A_extra = _load_extra_inning_game_info('Home3A')
game_info_4A_extra = _load_extra_inning_game_info('Home4A')

In [None]:
# Same six-way unpacking as the season pass, restricted to the extra-inning rows.
(hits_list_1A_extra, walks_list_1A_extra, strikeout_list_1A_extra,
 batters_faced_list_1A_extra, innings_pitched_list_1A_extra,
 is_starter_list_1A_extra) = get_pitching_metrics(game_info_1A_extra, '1A')

(hits_list_2A_extra, walks_list_2A_extra, strikeout_list_2A_extra,
 batters_faced_list_2A_extra, innings_pitched_list_2A_extra,
 is_starter_list_2A_extra) = get_pitching_metrics(game_info_2A_extra, '2A')

(hits_list_3A_extra, walks_list_3A_extra, strikeout_list_3A_extra,
 batters_faced_list_3A_extra, innings_pitched_list_3A_extra,
 is_starter_list_3A_extra) = get_pitching_metrics(game_info_3A_extra, '3A')

(hits_list_4A_extra, walks_list_4A_extra, strikeout_list_4A_extra,
 batters_faced_list_4A_extra, innings_pitched_list_4A_extra,
 is_starter_list_4A_extra) = get_pitching_metrics(game_info_4A_extra, '4A')

In [None]:
# Assemble one extra-inning metrics frame per level from its rows and six metric lists.
pitching_metrics_1A_extra = create_pitching_metrics_df(
    game_info_1A_extra,
    hits_list_1A_extra, walks_list_1A_extra, strikeout_list_1A_extra,
    batters_faced_list_1A_extra, innings_pitched_list_1A_extra, is_starter_list_1A_extra,
)
pitching_metrics_2A_extra = create_pitching_metrics_df(
    game_info_2A_extra,
    hits_list_2A_extra, walks_list_2A_extra, strikeout_list_2A_extra,
    batters_faced_list_2A_extra, innings_pitched_list_2A_extra, is_starter_list_2A_extra,
)
pitching_metrics_3A_extra = create_pitching_metrics_df(
    game_info_3A_extra,
    hits_list_3A_extra, walks_list_3A_extra, strikeout_list_3A_extra,
    batters_faced_list_3A_extra, innings_pitched_list_3A_extra, is_starter_list_3A_extra,
)
pitching_metrics_4A_extra = create_pitching_metrics_df(
    game_info_4A_extra,
    hits_list_4A_extra, walks_list_4A_extra, strikeout_list_4A_extra,
    batters_faced_list_4A_extra, innings_pitched_list_4A_extra, is_starter_list_4A_extra,
)

In [None]:
# (total pitches, pace) pair per level for the extra-inning rows, computed 1A → 4A.
(
    (total_pitches_list_1A_extra, pace_list_1A_extra),
    (total_pitches_list_2A_extra, pace_list_2A_extra),
    (total_pitches_list_3A_extra, pace_list_3A_extra),
    (total_pitches_list_4A_extra, pace_list_4A_extra),
) = (
    calculate_pace(_level_info, _level_name)
    for _level_info, _level_name in (
        (game_info_1A_extra, '1A'),
        (game_info_2A_extra, '2A'),
        (game_info_3A_extra, '3A'),
        (game_info_4A_extra, '4A'),
    )
)

In [None]:
# Attach pitch totals and pace values to each extra-inning metrics frame (in place).
for _metrics, _pitch_totals, _paces in (
    (pitching_metrics_1A_extra, total_pitches_list_1A_extra, pace_list_1A_extra),
    (pitching_metrics_2A_extra, total_pitches_list_2A_extra, pace_list_2A_extra),
    (pitching_metrics_3A_extra, total_pitches_list_3A_extra, pace_list_3A_extra),
    (pitching_metrics_4A_extra, total_pitches_list_4A_extra, pace_list_4A_extra),
):
    _metrics['total_pitches'] = _pitch_totals
    _metrics['pace'] = _paces

In [None]:
# Add a muscle_fatigue column to each extra-inning metrics frame, computed from that frame.
for _metrics in (
    pitching_metrics_1A_extra,
    pitching_metrics_2A_extra,
    pitching_metrics_3A_extra,
    pitching_metrics_4A_extra,
):
    _metrics['muscle_fatigue'] = apply_muscle_fatigue_equation(_metrics)

In [None]:
# Build the extra-inning cumulative frame for each level and discard its 'pace' column.
(
    cumulative_metrics_1A_extra,
    cumulative_metrics_2A_extra,
    cumulative_metrics_3A_extra,
    cumulative_metrics_4A_extra,
) = (
    create_cumulative_pitching_metrics_df(_metrics).drop(columns=['pace'])
    for _metrics in (
        pitching_metrics_1A_extra,
        pitching_metrics_2A_extra,
        pitching_metrics_3A_extra,
        pitching_metrics_4A_extra,
    )
)

In [None]:
# Step 1: fatigue units, derived from each level's extra-inning cumulative frame.
for _cumulative in (
    cumulative_metrics_1A_extra,
    cumulative_metrics_2A_extra,
    cumulative_metrics_3A_extra,
    cumulative_metrics_4A_extra,
):
    _cumulative['fatigue_units'] = apply_fatigue_unit_equation(_cumulative)

# Step 2: running total of muscle_fatigue within each pitcher, as a plain list per level.
(
    muscle_fatigue_1A_extra_cumsum,
    muscle_fatigue_2A_extra_cumsum,
    muscle_fatigue_3A_extra_cumsum,
    muscle_fatigue_4A_extra_cumsum,
) = (
    _per_game.groupby('pitcher').agg({'muscle_fatigue': 'cumsum'})['muscle_fatigue'].tolist()
    for _per_game in (
        pitching_metrics_1A_extra,
        pitching_metrics_2A_extra,
        pitching_metrics_3A_extra,
        pitching_metrics_4A_extra,
    )
)

# Step 3: running average = running total / games played so far.
# The division pairs list entries with rows by position.
# NOTE(review): this assumes each cumulative frame has the same row order as its
# pitching_metrics frame — confirm against create_cumulative_pitching_metrics_df.
for _cumulative, _running_total in (
    (cumulative_metrics_1A_extra, muscle_fatigue_1A_extra_cumsum),
    (cumulative_metrics_2A_extra, muscle_fatigue_2A_extra_cumsum),
    (cumulative_metrics_3A_extra, muscle_fatigue_3A_extra_cumsum),
    (cumulative_metrics_4A_extra, muscle_fatigue_4A_extra_cumsum),
):
    _cumulative['average_muscle_fatigue'] = _running_total / (_cumulative['games_played'])

In [None]:
# Reorder each extra-inning cumulative frame so that a pitcher's rows are contiguous,
# keeping the original row order within each pitcher, then renumber the index 0..n-1.
#
# This replaces `groupby('pitcher').apply(pd.DataFrame)`, whose result depends on the
# pandas version: older releases return like-indexed results in the original order,
# pandas 2.x returns them grouped by key (with a DeprecationWarning in 2.2 for operating
# on the grouping column), and once grouping columns are excluded from `apply` the
# 'pitcher' column would be lost. A stable sort states the intended ordering directly.
# Rows with a missing pitcher are dropped, as groupby does by default.
(
    cumulative_metrics_1A_extra,
    cumulative_metrics_2A_extra,
    cumulative_metrics_3A_extra,
    cumulative_metrics_4A_extra,
) = (
    _cumulative.dropna(subset=['pitcher'])
    .sort_values('pitcher', kind='stable')
    .reset_index(drop=True)
    for _cumulative in (
        cumulative_metrics_1A_extra,
        cumulative_metrics_2A_extra,
        cumulative_metrics_3A_extra,
        cumulative_metrics_4A_extra,
    )
)

In [None]:
# Add a rest_days column to each extra-inning cumulative frame, computed from that frame.
for _cumulative in (
    cumulative_metrics_1A_extra,
    cumulative_metrics_2A_extra,
    cumulative_metrics_3A_extra,
    cumulative_metrics_4A_extra,
):
    _cumulative['rest_days'] = calculate_rest_days(_cumulative)

In [None]:
# Stack the per-level extra-inning summaries into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; previously the
# reset_index(drop=True) result was only displayed and never assigned, so
# season_df_extra kept the repeated per-level index labels produced by concat.
season_df_extra = (
    pd.concat(
        [
            get_season_metrics_df(cumulative_metrics_1A_extra),
            get_season_metrics_df(cumulative_metrics_2A_extra),
            get_season_metrics_df(cumulative_metrics_3A_extra),
            get_season_metrics_df(cumulative_metrics_4A_extra),
        ],
        ignore_index=True,
    )
    .drop(columns=['rest_days'])
    .rename(columns={'game_day': 'last_game_day_played'})
)
# Show the combined frame as the cell output.
season_df_extra

In [None]:
# Persist the extra-inning summary to the working directory, without the index column.
season_df_extra.to_csv('extra_inning_metrics.csv', index=False)

## Cluster Analysis

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Elbow method: fit KMeans for k = 1..10 on the season features and record the inertia.
# NOTE(review): the columns are passed to KMeans as-is, with no scaling step in this
# cell — confirm that is intended, since the columns are on different scales.
_elbow_inputs = season_df[[
    'level', 'average_rest_days', 'fatigue_units', 'average_muscle_fatigue',
    'games_played', 'innings_pitched', 'total_pitches', 'WHIP', 'K/IP',
]]

sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(_elbow_inputs)
    sse.append(kmeans.inertia_)

# Plot inertia against k on an explicit Axes.
_elbow_fig, _elbow_ax = plt.subplots(figsize=(10, 6))
_elbow_ax.plot(range(1, 11), sse, marker='o')
_elbow_ax.set_xlabel('Number of clusters')
_elbow_ax.set_ylabel('Sum of squared errors')
_elbow_ax.set_title('Elbow Method for Optimal Number of Clusters')
plt.show()

In [None]:
# Number of clusters used for the final KMeans model.
clusters = 4

# Select the clustering inputs once so the fit and the silhouette score use the same data.
_clustering_inputs = season_df[[
    'level', 'average_rest_days', 'fatigue_units', 'average_muscle_fatigue',
    'games_played', 'innings_pitched', 'total_pitches', 'WHIP', 'K/IP',
]]

kmeans = KMeans(n_clusters=clusters, random_state=42, n_init=10)
season_df['cluster'] = kmeans.fit_predict(_clustering_inputs)

sil_score = silhouette_score(_clustering_inputs, season_df['cluster'])
print(f'Silhouette Score: {sil_score}')

In [None]:
# Per-cluster column means; the averaged pitcher id is meaningless, so it is removed.
cluster_summary = (
    season_df
    .groupby('cluster')
    .mean()
    .reset_index()
    .drop(columns=['pitcher'])
)

In [None]:
# Persist the per-cluster season averages to the working directory, without the index.
cluster_summary.to_csv('cluster_summary.csv', index=False)

In [None]:
# Give each extra-inning row the cluster its (pitcher, level) pair received in season_df.
# The first matching season row is used; a pair with no season row raises IndexError.
pitchers_list = season_df_extra['pitcher'].tolist()
level_list = season_df_extra['level'].tolist()
cluster_list = [
    season_df.loc[
        (season_df['pitcher'] == _pitcher_id) & (season_df['level'] == _level),
        'cluster',
    ].values[0]
    for _pitcher_id, _level in zip(pitchers_list, level_list)
]
season_df_extra['cluster'] = cluster_list

In [None]:
# Per-cluster column means of the extra-inning metrics, with the pitcher id left out.
_extra_without_pitcher = season_df_extra.drop(columns=['pitcher'])
cluster_summary_extra_df = _extra_without_pitcher.groupby('cluster').mean().reset_index()

In [None]:
# Persist the per-cluster extra-inning averages to the working directory, without the index.
cluster_summary_extra_df.to_csv('cluster_summary_extra_inning.csv', index=False)

## Predictive Modelling

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

In [None]:
# Predictors for the cluster classifiers; the target is the KMeans cluster label.
features = [
    'average_rest_days',
    'fatigue_units',
    'average_muscle_fatigue',
    'games_played',
    'total_pitches',
]
X, y = season_df[features], season_df[['cluster']]

# Hold out 40% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [None]:
# XGB Classifier
# XGB Classifier: train on the training split and report held-out accuracy.
xgb = XGBClassifier(random_state=42).fit(X_train, y_train)
y_pred = xgb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Random forest classifier
# Random forest classifier: the single-column target frame is flattened to 1-D for fit.
_train_labels = np.ravel(y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, _train_labels)

y_pred = rf.predict(X_test)

# The confusion matrix is kept in `cm` for the visualization section.
cm = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Naive Bayes
# Naive Bayes: same split and flattened target as the other classifiers.
_train_labels = np.ravel(y_train)
gnb = GaussianNB()
gnb.fit(X_train, _train_labels)

y_pred = gnb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


In [None]:
# Serialise the fitted random forest (`rf`) to disk in the working directory.
joblib.dump(rf, 'pitcher_classifier_model.joblib')

## Visualization

In [None]:
# Two-panel figure: random-forest confusion matrix (left) and feature importances (right).
# Relies on `cm`, `rf`, `X` and `features` from the predictive-modelling cells.
categories = ['1', '2', '3', '4']
plt.figure(figsize=(20, 6))

# --- Left panel: confusion matrix ---
plt.subplot(1, 2, 1)
plt.imshow(cm, interpolation='nearest', cmap='coolwarm')
plt.title('Confusion Matrix for Random Forest Classifier')
plt.colorbar()

tick_marks = np.arange(len(categories))
plt.xticks(tick_marks, categories)
plt.yticks(tick_marks, categories)

# Cells in the top or bottom third of the value range get white text,
# the remaining cells get black text.
cm_peak = cm.max()
thresh = cm_peak / 3
for i in range(len(categories)):
    for j in range(len(categories)):
        near_extreme = cm[i, j] > (cm_peak - thresh) or cm[i, j] < thresh
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if near_extreme else "black")
plt.xlabel('Predicted Clusters')
plt.ylabel('Actual Clusters')

# --- Right panel: feature importances, sorted ascending ---
importances = rf.feature_importances_
indices = np.argsort(importances)

plt.subplot(1, 2, 2)
plt.title("Feature Importances")
plt.barh(range(X.shape[1]), importances[indices], color="blue", align="center")
plt.yticks(range(X.shape[1]), np.array(features)[indices])
plt.xlabel('Importance')
# The y-axis was previously inverted twice in a row, which cancels out; both calls
# are removed, so the axis orientation (largest importance at the top) is unchanged.

plt.show()

In [None]:
# Shift the 0-based KMeans labels to 1-based labels for display.
# NOTE: not idempotent — running this cell a second time maps 4 to NaN and shifts the
# other labels again, so re-run the clustering cells before re-running this one.
_one_based_label = {_zero_based: _zero_based + 1 for _zero_based in range(4)}
for _labelled in (cluster_summary, season_df, cluster_summary_extra_df):
    _labelled['cluster'] = _labelled['cluster'].map(_one_based_label)

In [None]:
# Seaborn's "pastel" palette, used for the stacked bar chart of clusters by level.
pastel_colors = sns.color_palette("pastel")

In [None]:
# Scatter plot using Seaborn
# 2x2 grid of scatter plots, coloured by cluster. Each panel is (x column, y column,
# title); the axis labels are the column names themselves.
_scatter_panels = (
    ('fatigue_units', 'average_muscle_fatigue', 'Fatigue Units vs Muscle Fatigue'),
    ('games_played', 'total_pitches', 'Games Played vs Total Pitches'),
    ('fatigue_units', 'average_rest_days', 'Fatigue Units vs Average Rest Days'),
    ('K/IP', 'WHIP', 'K/IP vs WHIP'),
)

plt.figure(figsize=(12, 10))
for _panel_number, (_x_column, _y_column, _panel_title) in enumerate(_scatter_panels, start=1):
    plt.subplot(2, 2, _panel_number)
    sns.scatterplot(data=season_df, x=_x_column, y=_y_column, hue='cluster', palette='pastel', s=100)
    plt.title(_panel_title)
    plt.xlabel(_x_column)
    plt.ylabel(_y_column)
    plt.legend(title='Cluster')
    plt.grid(True)

plt.show()

In [None]:
# Plotting the average performance metrics for each cluster
# 2x2 grid of bar charts: average extra-inning metrics per cluster.
# Each panel is (summary column, title, y-axis label).
_extra_bar_panels = (
    ('WHIP', 'Average WHIP in Extra-innings', 'WHIP'),
    ('K/IP', 'Average K/IP in Extra-innings', 'K/IP'),
    ('total_pitches', 'Average Number of Pitches in Extra-innings', 'Pitches'),
    ('average_muscle_fatigue', 'Average Muscle Fatigue in Extra-innings', 'Muscle Fatigue'),
)

plt.figure(figsize=(12, 8))
for _panel_number, (_column, _panel_title, _y_label) in enumerate(_extra_bar_panels, start=1):
    plt.subplot(2, 2, _panel_number)
    sns.barplot(x=cluster_summary_extra_df['cluster'], y=cluster_summary_extra_df[_column], palette='pastel')
    plt.title(_panel_title)
    plt.xlabel('Cluster')
    plt.ylabel(_y_label)

plt.tight_layout()
plt.show()

In [None]:
# Plotting the average performance metrics for each cluster
# 2x2 grid of bar charts: average season metrics per cluster, drawn on explicit Axes.
_season_fig, _season_axes = plt.subplots(2, 2, figsize=(12, 8))
_whip_ax, _kip_ax, _games_ax, _innings_ax = _season_axes.flat

# WHIP by cluster
sns.barplot(x=cluster_summary['cluster'], y=cluster_summary['WHIP'], palette='pastel', ax=_whip_ax)
_whip_ax.set_title('Average WHIP')
_whip_ax.set_xlabel('Cluster')
_whip_ax.set_ylabel('WHIP')

# Strikeouts per inning by cluster
sns.barplot(x=cluster_summary['cluster'], y=cluster_summary['K/IP'], palette='pastel', ax=_kip_ax)
_kip_ax.set_title('Average K/IP')
_kip_ax.set_xlabel('Cluster')
_kip_ax.set_ylabel('K/IP')

# Games started vs relieved by cluster: reshape to long form so the two
# columns can be drawn side by side with a hue.
melted_df = cluster_summary.melt(
    id_vars=['cluster'],
    value_vars=['games_started', 'games_relieved'],
    var_name='games',
    value_name='value',
)
sns.barplot(x='cluster', y='value', hue='games', data=melted_df, palette='pastel', ax=_games_ax)
_games_ax.set_title('Average Games Started vs Relieved')
_games_ax.set_xlabel('Cluster')
_games_ax.set_ylabel('Games')

# Innings pitched by cluster
sns.barplot(x=cluster_summary['cluster'], y=cluster_summary['innings_pitched'], palette='pastel', ax=_innings_ax)
_innings_ax.set_title('Average Number of Innings Pitched')
_innings_ax.set_xlabel('Cluster')
_innings_ax.set_ylabel('Innings Pitched')

plt.tight_layout()
plt.show()

In [None]:
# Plotting the average performance metrics for each cluster
# Side-by-side bar charts of the two fatigue measures, averaged per cluster.
plt.figure(figsize=(12, 4))

# Fatigue units by cluster
plt.subplot(1, 2, 1)
sns.barplot(x=cluster_summary['cluster'], y=cluster_summary['fatigue_units'], palette='pastel')
plt.title('Average Fatigue Units')
plt.xlabel('Cluster')
plt.ylabel('Fatigue units')

# Muscle fatigue by cluster
plt.subplot(1, 2, 2)
sns.barplot(x=cluster_summary['cluster'], y=cluster_summary['average_muscle_fatigue'], palette='pastel')
# Title typo fixed: it previously read 'Average Mudcle Fatigue'.
plt.title('Average Muscle Fatigue')
plt.xlabel('Cluster')
plt.ylabel('Muscle fatigue')

plt.tight_layout()
plt.show()

In [None]:
# Count pitchers per (level, cluster) and pivot to one row per level, one column per cluster.
cluster_counts = (
    season_df
    .groupby(['level', 'cluster'])
    .size()
    .reset_index(name='counts')
)
cluster_pivot = cluster_counts.pivot(index='level', columns='cluster', values='counts')

# Stacked bars: one bar per level, segmented by cluster.
_level_ax = cluster_pivot.plot(kind='bar', stacked=True, figsize=(10, 6), color=pastel_colors)
_level_ax.set_title('Number of Pitchers in Each Cluster by Level')
plt.xticks(rotation=0)
_level_ax.set_xlabel('Level')
_level_ax.set_ylabel('Number of Pitchers')
_level_ax.legend(title='Cluster', loc='upper center')
plt.show()

## Results

In [None]:
# Translate each cluster label into a role name; any label other than 1, 2 or 3
# falls through to 'Closer'.
_role_by_cluster = {
    1: 'Versatile Reliever',
    2: 'Middle Reliever',
    3: 'Starting Pitcher',
}
season_df['roles'] = season_df['cluster'].apply(
    lambda label: _role_by_cluster.get(label, 'Closer')
)

In [None]:
# Export the level, pitcher id and assigned role for every pitcher, without the index.
season_df[['level', 'pitcher', 'roles']].to_csv('pitcher_roles.csv', index=False)