In [1]:
# Import dependencies
import pandas as pd
from unidecode import unidecode
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

In [2]:
# Load the pitching statistics, then transliterate every player name to plain
# ASCII with unidecode so that names can be matched in the later merge on Name.
pitching_df = pd.read_csv('Pitching_data.csv')
pitching_df['Name'] = pitching_df['Name'].map(lambda raw_name: unidecode(str(raw_name)))
pitching_df.head()

Unnamed: 0,Rk,Name,Age,Tm,Lg,W,L,W-L%,ERA,G,...,WP,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W
0,1,Cory Abbott,26.0,WSN,NL,0,5,0.0,5.25,16,...,2,216,74.0,6.36,1.438,8.3,2.3,4.7,8.4,1.8
1,2,Albert Abreu,26.0,TOT,AL,2,2,0.5,3.26,33,...,6,172,124.0,4.77,1.474,8.1,1.2,5.1,8.8,1.73
2,3,Bryan Abreu,25.0,HOU,AL,4,0,1.0,1.94,55,...,7,248,202.0,2.12,1.177,6.7,0.3,3.9,13.1,3.38
3,4,Domingo Acevedo,28.0,OAK,AL,4,4,0.5,3.33,70,...,1,266,112.0,4.01,0.99,6.7,1.2,2.3,7.7,3.41
4,5,Jason Adam,30.0,TBR,AL,2,3,0.4,1.56,67,...,2,237,233.0,2.86,0.758,4.4,0.7,2.4,10.7,4.41


In [3]:
# Narrow the pitching data to the player/team identifiers and the two metrics
# (ERA, WHIP) used further down. .copy() yields an independent frame.
selected_columns = ['Name', 'Tm', 'ERA', 'WHIP']
pitching_df_clean = pitching_df.loc[:, selected_columns].copy()

pitching_df_clean.head()

Unnamed: 0,Name,Tm,ERA,WHIP
0,Cory Abbott,WSN,5.25,1.438
1,Albert Abreu,TOT,3.26,1.474
2,Bryan Abreu,HOU,1.94,1.177
3,Domingo Acevedo,OAK,3.33,0.99
4,Jason Adam,TBR,1.56,0.758


In [4]:
# Load the player salary table from CSV
salaries_df = pd.read_csv(filepath_or_buffer='Salary_data.csv')
salaries_df.head()

Unnamed: 0,Name,Position,Salary
0,Max Scherzer,SP,"$43,333,333"
1,Mike Trout,CF,"$37,216,667"
2,Anthony Rendon,3B,"$36,571,428"
3,Gerrit Cole,SP,"$36,000,000"
4,Jacob deGrom,SP,"$36,000,000"


In [5]:
# Strip the currency formatting ('$' and ',' characters) and convert Salary to float.
# The pattern is a raw string so that '\$' reaches the regex engine as an escaped
# dollar sign; in a normal string literal '\$' is an invalid Python escape sequence
# (DeprecationWarning on older versions, SyntaxWarning on Python 3.12+).
salaries_df['Salary'] = salaries_df['Salary'].replace(r'[\$,]', '', regex=True).astype(float)
salaries_df.head()

Unnamed: 0,Name,Position,Salary
0,Max Scherzer,SP,43333333.0
1,Mike Trout,CF,37216667.0
2,Anthony Rendon,3B,36571428.0
3,Gerrit Cole,SP,36000000.0
4,Jacob deGrom,SP,36000000.0


In [6]:
# Attach Tm/ERA/WHIP to each salaried player by joining the two tables on Name.
# No `how` is given, so pandas performs its default inner join: only names that
# appear in both tables are kept.
pitching_players_df = salaries_df.merge(pitching_df_clean, on="Name")
pitching_players_df.head()

Unnamed: 0,Name,Position,Salary,Tm,ERA,WHIP
0,Max Scherzer,SP,43333333.0,NYM,2.29,0.908
1,Gerrit Cole,SP,36000000.0,NYY,3.5,1.017
2,Jacob deGrom,SP,36000000.0,NYM,3.08,0.746
3,Stephen Strasburg,SP,35000000.0,WSN,13.5,2.143
4,David Price,RP,32000000.0,LAD,2.45,1.165


In [7]:
# ERA threshold that pitchers are compared against when the ERAR flag is built.
# Recorded by the original author as the 2023 average ERA.
# NOTE(review): the source of this figure is not shown in the notebook — confirm.
era_spec = 4.28

In [8]:
# K-means takes a 2-D (n_samples, n_features) array, so turn the ERA column
# into a single-feature column vector.
data = pitching_players_df['ERA'].to_numpy().reshape(-1, 1)

# Build a seeded 2-cluster K-means model and fit it on the ERA values
kmeans = KMeans(n_clusters=2, random_state=42).fit(data)

# Display the fitted estimator
kmeans

KMeans(n_clusters=2, random_state=42)

In [9]:
# Centres of the fitted clusters, one row per cluster
cluster_centers = kmeans.cluster_centers_

# Compare each centre with the specified threshold and convert to 0/1 labels:
# 1 when that cluster's centre lies below era_spec, otherwise 0
labels = np.less(cluster_centers, era_spec).astype(int)

In [10]:
# Flag every pitcher against the ERA threshold:
#   0 -> ERA strictly below era_spec
#   1 -> everything else (ERA at or above era_spec, or missing)
# The flag is derived directly from the ERA column. Routing it through
# labels[1] / labels[0] made the 0/1 meaning depend on the arbitrary order in
# which K-means numbers its clusters, and on the two centres falling on opposite
# sides of the threshold. Reading the column also avoids relying on the shared
# `data` variable, which later cells overwrite, and stores a plain 1-D column.
pitching_players_df['ERAR'] = (~pitching_players_df['ERA'].lt(era_spec)).astype(int)

# Preview the flagged rows
pitching_players_df.head()

Unnamed: 0,Name,Position,Salary,Tm,ERA,WHIP,ERAR
0,Max Scherzer,SP,43333333.0,NYM,2.29,0.908,0
1,Gerrit Cole,SP,36000000.0,NYY,3.5,1.017,0
2,Jacob deGrom,SP,36000000.0,NYM,3.08,0.746,0
3,Stephen Strasburg,SP,35000000.0,WSN,13.5,2.143,1
4,David Price,RP,32000000.0,LAD,2.45,1.165,0


In [11]:
# WHIP threshold that pitchers are compared against when the WHIPR flag is built.
# Recorded by the original author as the 2023 average WHIP.
# NOTE(review): the source of this figure is not shown in the notebook — confirm.
whip_spec = 1.316

In [12]:
# K-means takes a 2-D (n_samples, n_features) array, so turn the WHIP column
# into a single-feature column vector.
data = pitching_players_df['WHIP'].to_numpy().reshape(-1, 1)

# Build a seeded 2-cluster K-means model and fit it on the WHIP values
kmeans = KMeans(n_clusters=2, random_state=42).fit(data)

# Display the fitted estimator
kmeans

KMeans(n_clusters=2, random_state=42)

In [13]:
# Centres of the fitted clusters, one row per cluster
cluster_centers = kmeans.cluster_centers_

# Compare each centre with the specified threshold and convert to 0/1 labels:
# 1 when that cluster's centre lies below whip_spec, otherwise 0
labels = np.less(cluster_centers, whip_spec).astype(int)

In [14]:
# Flag every pitcher against the WHIP threshold:
#   0 -> WHIP strictly below whip_spec
#   1 -> everything else (WHIP at or above whip_spec, or missing)
# The flag is derived directly from the WHIP column. Routing it through
# labels[1] / labels[0] made the 0/1 meaning depend on the arbitrary order in
# which K-means numbers its clusters, and on the two centres falling on opposite
# sides of the threshold. Reading the column also avoids relying on the shared
# `data` variable, which other cells overwrite, and stores a plain 1-D column.
pitching_players_df['WHIPR'] = (~pitching_players_df['WHIP'].lt(whip_spec)).astype(int)

# Preview the flagged rows
pitching_players_df.head()

Unnamed: 0,Name,Position,Salary,Tm,ERA,WHIP,ERAR,WHIPR
0,Max Scherzer,SP,43333333.0,NYM,2.29,0.908,0,0
1,Gerrit Cole,SP,36000000.0,NYY,3.5,1.017,0,0
2,Jacob deGrom,SP,36000000.0,NYM,3.08,0.746,0,0
3,Stephen Strasburg,SP,35000000.0,WSN,13.5,2.143,1,1
4,David Price,RP,32000000.0,LAD,2.45,1.165,0,0


In [15]:
# Salary ceiling that each player's Salary is compared against (ten million)
budget = 10_000_000

In [16]:
# K-means takes a 2-D (n_samples, n_features) array, so turn the Salary column
# into a single-feature column vector.
data = pitching_players_df['Salary'].to_numpy().reshape(-1, 1)

# Build a seeded 2-cluster K-means model and fit it on the Salary values
kmeans = KMeans(n_clusters=2, random_state=42).fit(data)

# Display the fitted estimator
kmeans

KMeans(n_clusters=2, random_state=42)

In [17]:
# Centres of the fitted clusters, one row per cluster
cluster_centers = kmeans.cluster_centers_

# Compare each centre with the specified threshold and convert to 0/1 labels:
# 1 when that cluster's centre lies below budget, otherwise 0
labels = np.less(cluster_centers, budget).astype(int)

In [18]:
# Flag every player against the salary ceiling. Despite the column name, the
# encoding matches the other flag columns, where 0 is the value the later filter keeps:
#   0 -> Salary strictly below budget
#   1 -> everything else (Salary at or above budget, or missing)
# The flag is derived directly from the Salary column. Routing it through
# labels[1] / labels[0] made the 0/1 meaning depend on the arbitrary order in
# which K-means numbers its clusters, and on the two centres falling on opposite
# sides of the threshold. Reading the column also avoids relying on the shared
# `data` variable, which other cells overwrite, and stores a plain 1-D column.
pitching_players_df['In_budget'] = (~pitching_players_df['Salary'].lt(budget)).astype(int)

# Preview the flagged rows
pitching_players_df.head()

Unnamed: 0,Name,Position,Salary,Tm,ERA,WHIP,ERAR,WHIPR,In_budget
0,Max Scherzer,SP,43333333.0,NYM,2.29,0.908,0,0,1
1,Gerrit Cole,SP,36000000.0,NYY,3.5,1.017,0,0,1
2,Jacob deGrom,SP,36000000.0,NYM,3.08,0.746,0,0,1
3,Stephen Strasburg,SP,35000000.0,WSN,13.5,2.143,1,1,1
4,David Price,RP,32000000.0,LAD,2.45,1.165,0,0,1


In [19]:
# Keep only the rows whose three flag columns are all below 1 (i.e. all 0 for
# these 0/1 flags); any row with a 1 in ERAR, WHIPR or In_budget is dropped.
columns_to_check = ['ERAR', 'WHIPR', 'In_budget']
all_flags_below_one = pitching_players_df[columns_to_check].lt(1).all(axis=1)
pitching_players_df = pitching_players_df[all_flags_below_one]
pitching_players_df

Unnamed: 0,Name,Position,Salary,Tm,ERA,WHIP,ERAR,WHIPR,In_budget
51,Alex Cobb,SP,9000000.0,SFG,3.73,1.303,0,0,0
52,Chris Bassitt,SP,8650000.0,NYM,3.42,1.145,0,0,0
53,Joe Musgrove,SP,8625000.0,SDP,2.93,1.083,0,0,0
55,Tyler Anderson,SP,8500000.0,LAD,2.57,1.002,0,0,0
56,Andrew Heaney,SP,8500000.0,LAD,3.10,1.087,0,0,0
...,...,...,...,...,...,...,...,...,...
403,Steven Wilson,RP,700000.0,SDP,3.06,1.057,0,0,0
404,Colin Poche,RP,700000.0,TBR,3.99,1.159,0,0,0
408,Ryan Thompson,RP,700000.0,TBR,3.80,1.172,0,0,0
409,Jandel Gustave,RP,700000.0,MIL,3.86,1.286,0,0,0


In [21]:
# Write the filtered player table to CSV, leaving out the DataFrame index
pitching_players_df.to_csv(path_or_buf='pitching_players.csv', index=False)