## Cleaning

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import metrics, preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

Read in player data

In [14]:
playerDf = pd.read_csv('trainingData/CollegeBasketballPlayers2009-2021.csv', low_memory=False)

#initial dropping of inconsistant data for player data 
playerDf = playerDf.rename(columns = {'Unnamed: 64' : 'role_position'})
playerDf = playerDf.drop('Unnamed: 65', axis=1) # Has no meaning in the dataset
playerDf = playerDf.drop('ht', axis=1) # Temporarily excluding this column due to inconsistent date formats and other values
playerDf = playerDf.drop('num', axis=1) # It might denote a player's choice or position but with significant variation in meaning.
playerDf = playerDf.drop('type', axis=1) # Column has 1 unique value
playerDf= playerDf.drop('pid', axis=1) # catigorical, not needed 
playerDf= playerDf.drop('yr', axis=1) # catigorical, not needed 
playerDf = playerDf.drop('player_name', axis=1)
playerDf = playerDf.drop('team', axis=1)
playerDf = playerDf.drop('conf', axis=1)
playerDf.shape

(61061, 57)

Display columns with over 60% nulls and drop them

In [15]:
# Count and remove columns with over 60% nan values from players
total = len(playerDf)
for column in playerDf.columns:
    count = playerDf[column].isna().sum()
    percent= (count/ total) * 100
    if (percent>=60):
        print(f"Player  DF: Column '{column}':", f"Number of NaN: {count}", f"Percentage: {percent:.2f}%")

Player  DF: Column 'Rec Rank': Number of NaN: 42591 Percentage: 69.75%
Player  DF: Column 'pick': Number of NaN: 59626 Percentage: 97.65%


In [19]:
#Drop all the over 60% cols from the player. Dont drop pick, nan recorded as not drafted, need for predictions filling in later
playerDf = playerDf.drop('Rec Rank', axis=1) 
role_position_label_encoder = LabelEncoder()
playerDf['role_position'] = role_position_label_encoder.fit_transform(playerDf['role_position'])

KeyError: "['Rec Rank'] not found in axis"

In [17]:
#player_corr_matrix = playerDf.corr()

# Create a heatmap
# plt.figure(figsize=(12, 10))
# sns.heatmap(player_corr_matrix, fmt=".2f", linewidths=.5)
# plt.title('Correlation Matrix Heatmap')
# plt.show()

Findand remove redundant data 

In [18]:
correlation_threshold = 0.9

# Create positive correlation matrix
corr_df = playerDf.corr().abs()


# Create and apply mask
mask = np.triu(np.ones_like(corr_df, dtype=bool))
tri_df = corr_df.mask(mask)

# Find columns that meet threshold
features_to_drop = [c for c in tri_df.columns if any(tri_df[c] > 0.90)]
print(features_to_drop)

ValueError: could not convert string to float: 'Combo G'

In [7]:
# Remove the highly correlated features from the DataFrame
playerDf = playerDf.drop(columns=features_to_drop, axis=1)

# Display the new shape of the DataFrame
print("Shape of the DataFrame after removing highly correlated features:", playerDf.shape)

Shape of the DataFrame after removing highly correlated features: (61061, 43)


In [8]:
player_corr_matrix = playerDf.corr()

# # Create a heatmap
# plt.figure(figsize=(12, 10))
# sns.heatmap(player_corr_matrix, fmt=".2f", linewidths=.5)
# plt.title('Correlation Matrix Heatmap')
# plt.show()

KNN to fill in all NAN values

In [20]:
#if drafted, change to 1 
playerDf.loc[playerDf["pick"] > 0, "pick"] = 1
#not drafted was nan, replace with 0 
playerDf.loc[playerDf["pick"] != 1, "pick"] = 0

In [22]:
#team_label_encoder = LabelEncoder()
#playerDf['team'] = team_label_encoder.fit_transform(playerDf['team'])

#conf_label_encoder = LabelEncoder()
#playerDf['conf'] = conf_label_encoder.fit_transform(playerDf['conf'])

role_position_label_encoder = LabelEncoder()
playerDf['role_position'] = role_position_label_encoder.fit_transform(playerDf['role_position'])

#player_name_label_encoder = LabelEncoder()
#playerDf['player_name'] = player_name_label_encoder.fit_transform(playerDf['player_name'])

In [23]:
imputer = KNNImputer(n_neighbors=10)
imputed_numeric = imputer.fit_transform(playerDf.select_dtypes(include=np.number))
dfBasketballTraining_filled = pd.DataFrame(imputed_numeric, columns=playerDf.select_dtypes(include=np.number).columns)
string_columns = playerDf.select_dtypes(include='object').columns
imputed_data = pd.concat([playerDf[string_columns], dfBasketballTraining_filled], axis=1)

KeyboardInterrupt: 

In [12]:
playerDf = pd.DataFrame(imputed_data, columns=playerDf.columns)

Split into training and test 

Remove overlapping players based on 4 year college career so that the training set is all "new" players

In [28]:
trainDf=playerDf.loc[playerDf['year'] <2017]
testDf= playerDf.loc[playerDf['year'] >= 2021]

Feature selection 

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X = trainDf.drop("pick", axis=1)
y = trainDf["pick"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train, y_train)

#importance
sel = SelectFromModel(rf_classifier)
sel.fit(X_train, y_train)

# Get support and selected feature names
selected_feature_indices = sel.get_support()
selected_features = X_train.columns[selected_feature_indices]

print("Selected features:")
print(selected_features)

print("Feature importance scores:")
print(sel.estimator_.feature_importances_)


Selected features:
Index(['GP', 'FTA', 'porpag', 'adjoe', 'midmade+midmiss',
       'dunksmiss+dunksmade', 'adrtg', 'bpm', 'obpm', 'dbpm', 'ogbpm', 'dgbpm',
       'pts'],
      dtype='object')
Feature importance scores:
[0.01974546 0.01812142 0.01495799 0.03602123 0.01963879 0.01879017
 0.01637709 0.01580989 0.0165297  0.0194236  0.02705902 0.01698366
 0.0170594  0.01951758 0.0136119  0.01720388 0.01455087 0.01831332
 0.05148607 0.03968478 0.01740149 0.01001954 0.02169502 0.02029286
 0.03444535 0.01931278 0.01795    0.03732596 0.01494993 0.05847997
 0.06227992 0.03084766 0.02576788 0.03402688 0.02826008 0.01884714
 0.02171011 0.02016268 0.01791246 0.02323173 0.0264026  0.00779213]


In [15]:
trainDf.loc[trainDf['player_name']==1314.0 ]

Unnamed: 0,player_name,team,conf,GP,usg,TS_per,ORB_per,DRB_per,AST_per,TO_per,...,dbpm,ogbpm,dgbpm,oreb,treb,ast,stl,blk,pts,role_position
4,1314.0,230.0,10.0,33.0,22.0,54.31,8.3,18.6,8.2,22.7,...,0.109983,-1.68486,-0.668318,1.4242,4.7273,0.8485,0.4545,0.3333,7.5758,8.0
685,1314.0,225.0,6.0,33.0,10.7,53.23,7.4,10.2,3.4,16.8,...,2.07682,-1.03182,1.49005,0.8485,2.0606,0.2727,0.2727,0.3939,2.3939,8.0
8409,1314.0,344.0,24.0,4.0,8.7,0.0,20.1,20.1,0.0,0.0,...,-6.67043,-3.67591,-5.35212,0.25,0.5,0.0,0.0,0.0,0.0,2.0
12523,1314.0,285.0,24.0,30.0,18.7,53.9,5.1,11.3,7.0,14.4,...,0.348772,2.72518,0.49009,1.0333,3.2333,0.7667,0.5333,0.2333,8.7,6.0
15649,1314.0,285.0,25.0,37.0,18.8,49.47,4.2,15.5,8.5,17.3,...,1.69868,0.576665,1.46138,0.8649,4.027,1.027,0.6216,0.1081,8.0811,7.0
19376,1314.0,285.0,25.0,4.0,17.0,28.64,1.6,13.3,2.6,24.7,...,-0.184311,-6.91902,-0.210866,0.25,2.25,0.25,0.25,0.25,3.0,6.0
23376,1314.0,285.0,25.0,36.0,18.5,61.0,3.3,13.9,12.4,18.5,...,1.45842,2.71659,1.4099,0.9167,5.0,2.0833,0.8611,0.1111,12.3056,7.0
27789,1314.0,285.0,25.0,37.0,21.2,57.3,3.8,18.8,15.2,15.7,...,1.41825,5.2572,1.03588,1.1351,6.9189,2.5405,0.7568,0.1892,14.7838,7.0
36508,1314.0,109.0,14.0,27.0,12.7,49.88,1.7,10.9,9.1,22.8,...,-1.33441,-3.84162,-0.478065,0.1852,1.2593,0.6296,0.4074,0.0,2.3333,7.0


## Model

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [25]:
#Trying with k-fold cross validation 

In [26]:
def get_training_data(dataframe, replacement, features = 13, multiplicity = 10):
    trainingCount = int(features*multiplicity/2)
    not_drafted = dataframe[dataframe['pick'] == 0].sample(n=trainingCount, replace=replacement)
    drafted = dataframe[dataframe['pick'] == 1].sample(n=trainingCount, replace=replacement)
    return pd.concat([not_drafted, drafted], ignore_index=True)
    

In [71]:
#get divided training data for k-crossfold validation 
#all features (no pick)
# features=['usg', 'DRB_per', 'AST_per', 'TO_per',  'twoP_per', 'blk_per', 'porpag','bpm', 'dbpm', 'ogbpm', 'dgbpm']
# features = ['GP', 'FTA', 'porpag', 'adjoe', 'adrtg', 'bpm', 'obpm', 'dbpm', 'ogbpm', 'dgbpm','blk', 'pts']
# features = ['GP', 'FTA', 'porpag', 'adjoe', 'adrtg', 'bpm', 'obpm', 'dbpm', 'blk', 'pts']
features=['porpag', 'adjoe','adrtg', 'bpm', 'obpm', 'dbpm', 'ogbpm', 'dgbpm', 'pts']

# features= selected_features

numFeatures= len(features)
rows=trainDf.shape[0]
numSets=5
perSet = int(rows/numSets)

#shuffle
trainDf = trainDf.sample(frac=1, random_state=42).reset_index(drop=True)

set1=get_training_data(trainDf, False) 
set1.info()
#set will inclide any extera entries 
trainingSets = [get_training_data(trainDf, False), get_training_data(trainDf, False), get_training_data(trainDf, False), get_training_data(trainDf, False)]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 56 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   GP                               130 non-null    int64  
 1   Min_per                          130 non-null    float64
 2   Ortg                             130 non-null    float64
 3   usg                              130 non-null    float64
 4   eFG                              130 non-null    float64
 5   TS_per                           130 non-null    float64
 6   ORB_per                          130 non-null    float64
 7   DRB_per                          130 non-null    float64
 8   AST_per                          130 non-null    float64
 9   TO_per                           130 non-null    float64
 10  FTM                              130 non-null    int64  
 11  FTA                              130 non-null    int64  
 12  FT_per                

In [72]:
def kFoldTrain(model, sets, features, target='pick'):
    prefMetrics = {'Precision': [], 'Recall': [], 'F1 Score': []}

    for j in range(len(sets)):
        valSet=sets[j]
        
        for i in range(len(sets)):
            #dont train validation set against validation set 
            if i!=j: 
            
                #remove target variable pick from the training and the testing data 
                trainSet=sets[i]
                trainNoTarget=trainSet[features]
                trainTarget=trainSet[target]
                valNoTarget=valSet[features]
                valTarget=valSet[target]
                
                #train 
                model.fit(trainNoTarget, trainTarget)
                
                predVals = model.predict(valNoTarget)
                
                prefMetrics['Precision'].append(precision_score(valTarget, predVals))
                prefMetrics['Recall'].append(recall_score(valTarget, predVals))
                prefMetrics['F1 Score'].append(f1_score(valTarget, predVals))
            
    return prefMetrics 
                

In [73]:
#citations: 
#video: https://aleksandarhaber.com/ensemble-learning-in-scikit-learn-voting-classifiers/
#Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
#build model 

#the 3 models for the ensemble 
logreg = LogisticRegression(multi_class='multinomial', max_iter=2000, random_state=42)
randFor = RandomForestClassifier(n_estimators=50, random_state=15)
svm = SVC(probability=True, random_state=1) 

# voting ensemble classifier 
model = VotingClassifier(estimators=[('lr', logreg), ('rf', randFor), ('svm', svm)], voting='hard')

trainMetrics=kFoldTrain(model, trainingSets, features)


In [74]:
pd.DataFrame(trainMetrics)


Unnamed: 0,Precision,Recall,F1 Score
0,0.820896,0.846154,0.833333
1,0.848485,0.861538,0.854962
2,0.84127,0.815385,0.828125
3,0.884058,0.938462,0.910448
4,0.898551,0.953846,0.925373
5,0.876923,0.876923,0.876923
6,0.842857,0.907692,0.874074
7,0.859155,0.938462,0.897059
8,0.850746,0.876923,0.863636
9,0.78481,0.953846,0.861111


In [75]:
#test the model 
testNoPick=testDf[features]
testPick=testDf['pick']

# Predict if player was selected for the draft on the test set
predTest = model.predict(testNoPick)


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

Validation/testing precision, recall and fscore 

In [24]:
# #preformance metrics for validation 
# precision = precision_score(valPick, predVals)
# recall = recall_score(valPick, predVals)
# f1 = f1_score(valPick, predVals)

# print('Precision on Validation Set:', precision)
# print('Recall on Validation Set:', recall)
# print('F1-score on Validation Set:', f1)

#preformance metrics for test set
print()
tPrecision = precision_score(testPick, predTest)
tRecall = recall_score(testPick, predTest)
tF1 = f1_score(testPick, predTest)

print('Precision on Test Set:', tPrecision)
print('Recall on Test Set:', tRecall)
print('F1-score on Test Set:', tF1)



Precision on Test Set: 0.5833333333333334
Recall on Test Set: 0.14285714285714285
F1-score on Test Set: 0.22950819672131142


## Test run pref values for cross fold and various features selected:

1) features = ['GP', 'FTA', 'porpag', 'adjoe', 'adrtg', 'bpm', 'obpm', 'dbpm', 'blk', 'pts']

![image.png](attachment:image.png)

2) features=['usg', 'DRB_per', 'AST_per', 'TO_per',  'twoP_per', 'blk_per', 'porpag','bpm', 'dbpm', 'ogbpm', 'dgbpm']

![image-2.png](attachment:image-2.png)

3) features=['porpag', 'adjoe','adrtg', 'bpm', 'obpm', 'dbpm', 'ogbpm', 'dgbpm', 'pts']

![image-4.png](attachment:image-4.png)

# Testing by training on balanced data

In [53]:
def get_training_data(dataframe, replacement, features = 13, multiplicity = 10):
    trainingCount = int(features*multiplicity/2)
    not_drafted = dataframe[dataframe['pick'] == 0].sample(n=trainingCount, replace=replacement)
    drafted = dataframe[dataframe['pick'] == 1].sample(n=trainingCount, replace=replacement)
    return pd.concat([not_drafted, drafted], ignore_index=True)

In [54]:
def trainRandSets(model, features, sets, target='pick'): 
    prefMetrics = {'Precision': [], 'Recall': [], 'F1 Score': []}

    for data in sets:
        y = data[target]
        X = data.drop(target, axis=1) 
        
        trainTarget, valTarget, trainNoTarget, valNoTarget = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train
        model.fit(trainNoTarget, trainTarget)  

        predVals = model.predict(valNoTarget)

        prefMetrics['Precision'].append(precision_score(valTarget, predVals))
        prefMetrics['Recall'].append(recall_score(valTarget, predVals))
        prefMetrics['F1 Score'].append(f1_score(valTarget, predVals))

    return prefMetrics

In [55]:
def shuffle(): 
    return trainDf.sample(frac=1, random_state=42).reset_index(drop=True)

set1=get_training_data(trainDf, False, len(features))
shuffle()
set2=get_training_data(trainDf, False, len(features))
shuffle()
set3=get_training_data(trainDf, False, len(features))
shuffle()
set4=get_training_data(trainDf, False, len(features))
shuffle()
set5=get_training_data(trainDf, False, len(features))

trainingSets=[set1, set2, set3, set4, set5]

In [1]:
logreg = LogisticRegression(multi_class='multinomial', max_iter=2000, random_state=42)
randFor = RandomForestClassifier(n_estimators=50, random_state=15)
svm = SVC(probability=True, random_state=1) 

# voting ensemble classifier 
model2 = VotingClassifier(estimators=[('lr', logreg), ('rf', randFor), ('svm', svm)], voting='hard')

#recycle the kfold train method since it's still just testing all the data 
trainMetrics=trainRandSets(model2,features, trainingSets)


NameError: name 'LogisticRegression' is not defined