<a href="https://colab.research.google.com/github/jrsansom110515/team2_w25_milestone2_datafiles/blob/main/team2_w25_milestone2_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import warnings

In [2]:
"""Clone the git repository that holds the CSV files"""
!git clone https://github.com/jrsansom110515/team2_w25_milestone2_datafiles.git


Cloning into 'team2_w25_milestone2_datafiles'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 9 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (9/9), 1.20 MiB | 6.68 MiB/s, done.


In [27]:
""" This cell contains key functions used through out the remaining workbook"""

# Function to load in datasets

def load_data_csv(files_list):
    if len(files_list) != 2:
        raise ValueError("files_list must contain exactly two file names.")

    # Replace the file path with your location.
    MLB_df = pd.read_csv('/content/team2_w25_milestone2_datafiles/' + files_list[1])
    MiLB_df = pd.read_csv('/content/team2_w25_milestone2_datafiles/' + files_list[0])

    return MiLB_df, MLB_df

# Function to merge datasets and create a new column from the payerID to signify MLB debut

def merge_datasets(MiLB_df, MLB_df):
    MiLB_df['PlayerId'] = MiLB_df['PlayerId'].astype(str)
    MLB_df['PlayerId'] = MLB_df['PlayerId'].astype(str)

    merged_df = MiLB_df.merge(MLB_df[['PlayerId']], on = 'PlayerId', how = 'left', indicator = True)

    merged_df['MLB_debut'] = merged_df['_merge'].apply(lambda x: 1 if x == 'both' else 0)

    merged_df = merged_df.drop(columns=['PlayerId', '_merge', 'Name', 'Team'])

    return merged_df

# Function to extract age and level

def age_level(MiLB_m_df):

    # Get max age
    MiLB_m_df['max_age'] = MiLB_m_df['Age'].apply(lambda x: x.split('-')[1]).astype(int)

    # Get levels into dummy variables
    dummy_df = MiLB_m_df['Level'].str.get_dummies(',')
    dummy_df['num_level'] = dummy_df.sum(axis = 1)
    MiLB_m_df = pd.concat([MiLB_m_df,dummy_df], axis=1)
    MiLB_m_df = MiLB_m_df.drop(columns=['Level', 'Age'])

    return MiLB_m_df

# Function combine two datasets for train_test_split

def combine_dataset(df1, df2):
  combine_df_NA = pd.concat([df1, df2], axis = 0).reset_index(drop = True)
  combine_df = combine_df_NA.dropna(axis = 1)

  return combine_df

# Function to define features for X and y to setup and perform testing, the return is the train and test variables

def training_testing(df, target_feature, test_size):
  X = df.drop(columns = [target_feature])
  y = df[target_feature]

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)

  return X_train, X_test, y_train, y_test


In [4]:
"""
Datasets that contains all MiLB batter data and datasets for MLB debut

These datasets names are the same as the ones previously shared, so they these list should be fine
as long as the names are not changed.
"""
milb_18_20, mlb_21_22 = load_data_csv(['Historical_MiLB_players_dataset_2018_2020.csv', 'MLB_debut_2021_2022.csv'])
milb_19_21, mlb_22_23 = load_data_csv(['Historical_MiLB_players_dataset_2019_2021.csv', 'MLB_debut_2022_2023.csv'])
milb_20_22, mlb_23_24 = load_data_csv(['Historical_MiLB_players_dataset_2020_2022.csv', 'MLB_debut_2023_2024.csv'])

In [5]:
milb_18_20.head()

Unnamed: 0,Name,Team,Level,Age,PA,BB%,K%,BB/K,AVG,OBP,...,OPS,ISO,Spd,BABIP,wSB,wRC,wRAA,wOBA,wRC+,PlayerId
0,Christian Bethancourt,MIL,AAA,26-26,418,0.047847,0.181818,0.263158,0.296675,0.327751,...,0.834145,0.209719,3.856732,0.318937,0.664623,61.600626,7.035243,0.358492,112.893234,10028
1,Tomas Telis,MIA,AAA,27-28,633,0.056872,0.099526,0.571429,0.319728,0.360759,...,0.806338,0.12585,3.715513,0.340426,-0.26136,90.12304,5.657395,0.349854,106.467915,10067
2,Nick Franklin,MIL,"A+,AA,AAA,R",27-28,416,0.125,0.185096,0.675325,0.257062,0.351807,...,0.775536,0.166667,5.337587,0.296703,-0.062833,56.880421,0.111199,0.342564,101.249987,10166
3,Francisco Arcia,LAA,"AA,AAA",28-29,432,0.050926,0.162037,0.314286,0.232737,0.287059,...,0.588849,0.069054,3.268657,0.271875,-1.108097,29.449425,-29.670384,0.267188,52.290427,10286
4,Danny Ortiz,PHI,AAA,28-28,418,0.045455,0.241627,0.188119,0.232143,0.270335,...,0.668294,0.165816,1.629498,0.271429,-0.482535,39.405764,-7.674912,0.295148,83.698381,10317


In [9]:
"""
Merged datasets creating a new column of MLB debut with a 1 if player made debut in these years or 0 if they did not

Change the age column and level from a categorical variable to a numeric variable for modeling purposes
"""

# merge the above datasets
milb_18_20_mlb_21_22 = merge_datasets(milb_18_20, mlb_21_22)
milb_19_21_mlb_22_23 = merge_datasets(milb_19_21, mlb_22_23)
milb_20_22_mlb_23_24 = merge_datasets(milb_20_22, mlb_23_24)

# change the age and level columns to numeric
milb_18_20_mlb_21_22 = age_level(milb_18_20_mlb_21_22)
milb_19_21_mlb_22_23 = age_level(milb_19_21_mlb_22_23)
milb_20_22_mlb_23_24 = age_level(milb_20_22_mlb_23_24)


In [22]:
milb_18_20_mlb_21_22.head()

Unnamed: 0,PA,BB%,K%,BB/K,AVG,OBP,SLG,OPS,ISO,Spd,...,wRC+,MLB_debut,max_age,A,A+,A-,AA,AAA,R,num_level
0,418,0.047847,0.181818,0.263158,0.296675,0.327751,0.506394,0.834145,0.209719,3.856732,...,112.893234,0,26,0,0,0,0,1,0,1
1,633,0.056872,0.099526,0.571429,0.319728,0.360759,0.445578,0.806338,0.12585,3.715513,...,106.467915,0,28,0,0,0,0,1,0,1
2,416,0.125,0.185096,0.675325,0.257062,0.351807,0.423729,0.775536,0.166667,5.337587,...,101.249987,0,28,0,1,0,1,1,1,4
3,432,0.050926,0.162037,0.314286,0.232737,0.287059,0.30179,0.588849,0.069054,3.268657,...,52.290427,0,29,0,0,0,1,1,0,2
4,418,0.045455,0.241627,0.188119,0.232143,0.270335,0.397959,0.668294,0.165816,1.629498,...,83.698381,0,28,0,0,0,0,1,0,1


In [18]:
"""Create the dataset for train_test_split to use on the early stage datasets"""

tts_dataset = combine_dataset(milb_18_20_mlb_21_22, milb_19_21_mlb_22_23)

In [19]:
tts_dataset


Unnamed: 0,PA,BB%,K%,BB/K,AVG,OBP,SLG,OPS,ISO,Spd,...,wRC+,MLB_debut,max_age,A,A+,A-,AA,AAA,R,num_level
0,418,0.047847,0.181818,0.263158,0.296675,0.327751,0.506394,0.834145,0.209719,3.856732,...,112.893234,0,26,0,0,0,0,1,0,1
1,633,0.056872,0.099526,0.571429,0.319728,0.360759,0.445578,0.806338,0.125850,3.715513,...,106.467915,0,28,0,0,0,0,1,0,1
2,416,0.125000,0.185096,0.675325,0.257062,0.351807,0.423729,0.775536,0.166667,5.337587,...,101.249987,0,28,0,1,0,1,1,1,4
3,432,0.050926,0.162037,0.314286,0.232737,0.287059,0.301790,0.588849,0.069054,3.268657,...,52.290427,0,29,0,0,0,1,1,0,2
4,418,0.045455,0.241627,0.188119,0.232143,0.270335,0.397959,0.668294,0.165816,1.629498,...,83.698381,0,28,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5282,465,0.103226,0.217204,0.475248,0.226277,0.308190,0.381995,0.690185,0.155718,3.362663,...,87.617538,0,21,0,1,0,0,0,1,2
5283,674,0.068249,0.296736,0.230000,0.213592,0.274481,0.309061,0.583542,0.095469,4.104955,...,69.510016,0,21,1,1,0,0,0,0,3
5284,801,0.113608,0.415730,0.273273,0.234957,0.331665,0.395415,0.727080,0.160458,6.516063,...,102.109879,0,22,1,1,0,0,0,0,2
5285,602,0.117940,0.287375,0.410405,0.221797,0.312187,0.328872,0.641059,0.107075,4.155988,...,83.334165,0,21,1,1,0,0,0,1,3


In [28]:
"""
Run the function from the function list to return X_train, X_test, y_train, y_test

The function takes 3 inputs. 1 dataframe,
1 string name of the feature column that is the dependent variable,
and a test size as a float
"""

X_train, X_test, y_train, y_test = training_testing(tts_dataset, 'MLB_debut', 0.2)


In [33]:
""" Use models to start training and testing"""

# Random Forest Classification
rf_model = RandomForestClassifier(n_estimators = 100, random_state = 42)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

acc_test_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy on Split Test Set: {acc_test_rf:.2f}")
print("\nRandom Forest Classification Report on Split Test Set:")
print(classification_report(y_test, y_pred_rf))

print('-------------------------------------------------------------------------')

# Logistic Regression Classification
lr_model = LogisticRegression(max_iter = 1000, random_state = 42)

lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

acc_test_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy on Split Test Set: {acc_test_lr:.2f}")
print("\nLogistic Regression Classification Report on Split Test Set:")
print(classification_report(y_test, y_pred_lr))

Random Forest Accuracy on Split Test Set: 0.96

Random Forest Classification Report on Split Test Set:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1014
           1       0.29      0.05      0.08        44

    accuracy                           0.96      1058
   macro avg       0.62      0.52      0.53      1058
weighted avg       0.93      0.96      0.94      1058

-------------------------------------------------------------------------
Logistic Regression Accuracy on Split Test Set: 0.95

Logistic Regression Classification Report on Split Test Set:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1014
           1       0.30      0.07      0.11        44

    accuracy                           0.95      1058
   macro avg       0.63      0.53      0.54      1058
weighted avg       0.93      0.95      0.94      1058



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
