# Predicting Baseball Hall of Famers
### Joel Simon
#### MA 544 Final Project

In [1]:
#All the import statements

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt

# Scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
#File paths
hall_of_fame_path = "./bgadoci-lahmans-baseball-database/original/HallOfFame.csv"
all_star_path = "./bgadoci-lahmans-baseball-database/original/AllstarFull.csv"
awards_path = "./bgadoci-lahmans-baseball-database/original/AwardsPlayers.csv"
master_path = "./bgadoci-lahmans-baseball-database/original/Master.csv"

pitching_path = "./bgadoci-lahmans-baseball-database/original/Pitching.csv"
batting_path = "./bgadoci-lahmans-baseball-database/original/Batting.csv"
pitching_post_path = "./bgadoci-lahmans-baseball-database/original/PitchingPost.csv"
batting_post_path = "./bgadoci-lahmans-baseball-database/original/BattingPost.csv"

In [3]:
#Saving all of the data

hall_of_famers_data = pd.read_csv(hall_of_fame_path, sep=",", header=0, usecols=["playerID", "inducted"], converters={"inducted": lambda i: 1 if i == "Y" else 0})
all_star_data = pd.read_csv(all_star_path, sep=",", header=0, usecols=["playerID", "yearID"])
awards_data = pd.read_csv(awards_path, sep=",", header=0, usecols=["playerID", "yearID"])
master_data = pd.read_csv(master_path, sep=",", header=0, usecols=["playerID", "finalGame"]).fillna("2023-01-01")
master_data["finalGame"] = pd.to_datetime(master_data["finalGame"])

pitching_data = pd.read_csv(pitching_path, sep=",", header=0).fillna(0).drop(["stint", "teamID", "lgID"], axis=1)
batting_data = pd.read_csv(batting_path, sep=",", header=0).fillna(0).drop(["stint", "teamID", "lgID"], axis=1)
pitching_post_data = pd.read_csv(pitching_post_path, sep=",", header=0).fillna(0).drop(["round", "teamID", "lgID"], axis=1)
batting_post_data = pd.read_csv(batting_post_path, sep=",", header=0).fillna(0).drop(["round", "teamID", "lgID"], axis=1)

In [4]:
'''
This block is to combine the post season and regular season data

This is done by concatinating the data, and then aggrigating them together
All of the functions show that the data from the regular and post season get added together

In the end, each player will have one (or zero) entries per year
'''

pitching_data = pd.concat([pitching_data, pitching_post_data], ignore_index=True)
batting_data = pd.concat([batting_data, batting_post_data], ignore_index=True)

pitching_funcs = {'W': 'sum', 'L': 'sum', 'G': 'sum', 'GS': 'sum', 'CG': 'sum', 'SHO': 'sum', 'SV': 'sum', 'IPouts': 'sum', 
                  'H': 'sum', 'ER': 'sum', 'HR': 'sum', 'BB': 'sum', 'SO': 'sum', 'BAOpp': 'first', 'ERA': 'first', 
                  'IBB': 'sum', 'WP': 'sum', 'HBP': 'sum', 'BK': 'sum', 'BFP': 'sum', 'GF': 'sum', 'R': 'sum', 'SH': 'sum',
                  'SF': 'sum', 'GIDP': 'sum'}

batting_funcs =  {'G': 'sum', 'AB': 'sum', 'R': 'sum', 'H': 'sum', '2B': 'sum', '3B': 'sum', 'HR': 'sum', 'RBI': 'sum', 
                  'SB': 'sum', 'CS': 'sum', 'BB': 'sum', 'SO': 'sum', 'IBB': 'sum', 'HBP': 'first', 'SH': 'first',
                  'SF': 'sum', 'GIDP': 'sum'}

pitching_data = pitching_data.groupby(["playerID", "yearID"]).agg(pitching_funcs).reset_index()
batting_data = batting_data.groupby(["playerID", "yearID"]).agg(batting_funcs).reset_index()

In [5]:
# What some of the Batting Data looks like.
# Very similar to the pitching data
batting_data[1000:1010]

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
1000,allenet01,1931,94,298.0,58.0,98.0,18.0,2.0,5.0,43.0,6.0,0.0,15.0,15.0,0.0,1.0,7.0,0.0,0.0
1001,allenet01,1932,54,103.0,13.0,18.0,6.0,2.0,1.0,7.0,0.0,0.0,1.0,12.0,0.0,2.0,1.0,0.0,0.0
1002,allenet01,1933,91,261.0,25.0,63.0,7.0,3.0,0.0,36.0,3.0,0.0,13.0,22.0,0.0,1.0,6.0,0.0,4.0
1003,allenet01,1934,145,581.0,87.0,192.0,42.0,4.0,10.0,85.0,6.0,0.0,33.0,47.0,0.0,3.0,16.0,0.0,16.0
1004,allenet01,1935,156,645.0,90.0,198.0,46.0,1.0,8.0,63.0,5.0,0.0,43.0,54.0,0.0,1.0,11.0,0.0,5.0
1005,allenet01,1936,121,498.0,68.0,147.0,21.0,7.0,4.0,48.0,16.0,0.0,17.0,38.0,0.0,0.0,3.0,0.0,13.0
1006,allenet01,1937,103,320.0,39.0,101.0,18.0,1.0,0.0,31.0,3.0,4.0,21.0,17.0,0.0,1.0,7.0,0.0,0.0
1007,allenet01,1938,19,33.0,4.0,10.0,3.0,1.0,0.0,4.0,0.0,0.0,2.0,4.0,0.0,0.0,1.0,0.0,0.0
1008,allenfr01,1912,20,36.0,4.0,6.0,3.0,1.0,1.0,4.0,0.0,0.0,3.0,15.0,0.0,0.0,2.0,0.0,0.0
1009,allenfr01,1913,34,51.0,5.0,7.0,0.0,1.0,1.0,3.0,0.0,0.0,3.0,23.0,0.0,0.0,2.0,0.0,0.0


In [6]:
# Returns a 18847x145 matrix with the scores of each player
def get_scores(n_clusters):
    #Create the large (18847x145) matrix
    player_scores = pd.DataFrame(index=master_data["playerID"])

    starting_year = 1871
    ending_year = 2015
    
    #Calculate the to give to pitchers
    def calculate_pitching_scores(year, clusters):
        #Get the pitcher from the given year and their names
        current_pitching_data = pitching_data[pitching_data["yearID"]==year]
        current_names = current_pitching_data["playerID"]
        current_pitching_data = current_pitching_data.drop(["playerID", "yearID"], axis=1)
    
        #Scale the data to a normal distribution
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(current_pitching_data)
        
        #Do KMeans clustering
        km = KMeans(n_clusters=clusters, max_iter=1000, tol=0.001, init="random")
        km.fit(scaled_data)
        d = pd.DataFrame(km.labels_, index=current_names, columns=["class"])
        
        #Go though all the award winners and add the class they are in to a set
        classes = set()
        for n in awards_data[awards_data["yearID"]==year]["playerID"]:
            if n in d.index:
                classes.add(d.loc[n, "class"])
        
        #For each class in the set, give them a point
        for n in d[d["class"].isin(classes)].index:
            player_scores.loc[n, year] += 1

    
    #Calculate the score to give to batters
    #The same as the pitchers scoring
    def calculate_batting_scores(year, clusters):
        current_batting_data = batting_data[batting_data["yearID"]==year]
        current_names = current_batting_data["playerID"]
        current_batting_data = current_batting_data.drop(["playerID", "yearID"], axis=1)

        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(current_batting_data)

        km = KMeans(n_clusters=clusters, max_iter=1000, tol=0.001, init="random")
        km.fit(scaled_data)
        d = pd.DataFrame(km.labels_, index=current_names, columns=["class"])

        classes = set()
        for n in awards_data[awards_data["yearID"]==year]["playerID"]:
            if n in d.index:
                classes.add(d.loc[n, "class"])

        for n in d[d["class"].isin(classes)].index:
            player_scores.loc[n, year] += 1

    temp_df = pd.DataFrame(columns=list(range(starting_year, ending_year+1)), index=player_scores.index)

    player_scores = pd.concat([player_scores, temp_df], axis=1)

    for current_year in range(starting_year, ending_year+1):
        #Set the current year scores to 0.  
        #Then calculate and add the scored based off of pitchers and batters
        player_scores[current_year] = 0
        calculate_pitching_scores(current_year, n_clusters)
        calculate_batting_scores(current_year, n_clusters)
        
        #Points for being an all star or award winner
        for n in awards_data[awards_data["yearID"]==current_year]["playerID"]:
            player_scores.loc[n,current_year] += 1

        for n in all_star_data[all_star_data["yearID"]==current_year]["playerID"]:
            player_scores.loc[n,current_year] += 1
            
    return player_scores

In [7]:
#Get all the players who have been inducted into the hall of fame
in_the_hall = pd.DataFrame(index=master_data["playerID"])
in_the_hall["in_hall"]=0
in_the_hall.loc[hall_of_famers_data[hall_of_famers_data["inducted"]==1]["playerID"]]=1

In [8]:
#Remove players who are unable to be inducted
#Must be out of the league for 5 years, and because the data ends in 2015, you must have finished before 2010
#Additionally, if the entire row is 0, there should be no chance you make it to the Hall of Fame, so they are removed to improve recall
too_young = master_data[master_data["finalGame"]>dt.datetime(2010, 1, 1)]["playerID"].tolist()
def remove_new_or_mediocre(scores, hof):
    scores = scores.drop(too_young, axis=0)
    hof = hof.drop(too_young, axis=0)
    
    zero_score = scores[scores.sum(axis=1)==0].index
    scores = scores.drop(zero_score, axis=0)
    hof = hof.drop(zero_score, axis=0)
    
    return scores, hof

In [9]:
regs = {}
# Test cluster from 4 to 17
for num_clusters in range(4, 18):
    #Get the scores and remove the unhelpful players
    scores = get_scores(num_clusters)
    
    scores, hof = remove_new_or_mediocre(scores, in_the_hall)
    scores = csr_matrix(scores.values)
    
    #Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(scores, hof, test_size=0.3)

    y_train = y_train["in_hall"].to_list()
    y_test = y_test["in_hall"].to_list()
    
    #Do the regression, and save it
    log_regression = LogisticRegression(tol=0.0001, max_iter=2000, penalty='l2')

    log_regression.fit(X_train, y_train)
    
    regs[num_clusters] = {"reg": log_regression, "X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test}

In [10]:
# Go through each regression and print the accuracy, Confusion Matrix, and Report
for c in range(4, 18):
    c_r = regs[c]["reg"]
    X_train, y_train, X_test, y_test = regs[c]["X_train"], regs[c]["y_train"], regs[c]["X_test"], regs[c]["y_test"]
    print("# Clusters:", c)
    print ("Train Accuracy: "+ str(c_r.score(X_train, y_train)))
    print ("Test Accuracy: "+ str(c_r.score(X_test, y_test)))
    
    conf_matrix = confusion_matrix(y_test, c_r.predict(X_test))
    class_report = classification_report(y_test, c_r.predict(X_test))

    print(conf_matrix)
    print(class_report)
    print()

# Clusters: 4
Train Accuracy: 0.9903488488991655
Test Accuracy: 0.9880393996247655
[[4181    9]
 [  42   32]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4190
           1       0.78      0.43      0.56        74

    accuracy                           0.99      4264
   macro avg       0.89      0.72      0.78      4264
weighted avg       0.99      0.99      0.99      4264


# Clusters: 5
Train Accuracy: 0.9907564810879728
Test Accuracy: 0.9878532473971244
[[3957    9]
 [  40   28]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3966
           1       0.76      0.41      0.53        68

    accuracy                           0.99      4034
   macro avg       0.87      0.70      0.76      4034
weighted avg       0.99      0.99      0.99      4034


# Clusters: 6
Train Accuracy: 0.9915516420752023
Test Accuracy: 0.9822320932815103
[[3513   13]
 [  51   25]]
              