Merge draft datasets

In [16]:
import pandas as pd

draftnet_f = '../data/nbadraft_strengths_weaknesses.csv'
alltimedraft_f = '../data/alltime_draft.csv'

draftnet_df = pd.read_csv(draftnet_f)
alltimedraft_df = pd.read_csv(alltimedraft_f)

# for getting a column to join on
alltimedraft_df['player'] = alltimedraft_df.PLAYER_NAME.str.lower().str.replace(' ','-')

In [21]:
keep_cols = ['PERSON_ID','PLAYER_NAME','SEASON','strengths','weaknesses', 'overall','Athleticism','Size','Defense','Strength','Quickness',
    'Leadership','JumpShot','NBAReady','Rebounding', 'Potential','PostSkills','Intangibles','BallHandling','Passing']
rename_dict = {'PERSON_ID':'person_id','PLAYER_NAME':'player','SEASON':'season'}
df = pd.merge(alltimedraft_df, draftnet_df, on='player', how='inner')[keep_cols]
df.rename(rename_dict, axis=1, inplace=True)
df = df.loc[df.season >= 2006]
df = df.dropna(subset=['strengths'])
df.head()

Unnamed: 0,person_id,player,season,strengths,weaknesses,overall,Athleticism,Size,Defense,Strength,Quickness,Leadership,JumpShot,NBAReady,Rebounding,Potential,PostSkills,Intangibles,BallHandling,Passing
0,1631094,Paolo Banchero,2022,"Has good size and length for his position, sta...","For all of his offensive gifts, still has room...",99,8,8,8,9,7,9,8,8,9.0,8,8.0,9,,
1,1631096,Chet Holmgren,2022,Extremely skilled frontcourt player who can in...,Lack of physical strength (195 lbs) remains hi...,97,8,9,9,6,7,8,8,8,9.0,9,8.0,8,,
2,1631099,Keegan Murray,2022,"A 6’8 225 frontcourt player with the frame, le...",Will have some questions about his true positi...,94,8,8,8,7,8,8,8,9,,7,,9,7.0,7.0
3,1631093,Jaden Ivey,2022,"An aggressive explosive, 6’4 200 lb combo guar...",Ivey’s hard charging energy and play style can...,98,9,8,8,9,9,8,7,8,,9,,8,7.0,8.0
4,1631097,Bennedict Mathurin,2022,"6’7 wing with tremendous size, maturity, explo...",The biggest hurdle for Mathurin at this point ...,97,9,8,7,9,8,9,8,8,,9,,8,7.0,7.0


Add in the all star information

In [22]:
allstar_f = '../data/nba_allstar_all.csv'
allstar_df = pd.read_csv(allstar_f)
allstar_df = allstar_df.loc[allstar_df.year >= 2001]

In [23]:
import numpy as np
all_star_indicator = []
all_star_first_year = []
for index, row in df.iterrows():
    if row['player'] in allstar_df['Player'].values:
        all_star_indicator.append(1)
        player_allstar = allstar_df.loc[allstar_df['Player'] == row['player']]
        year_min = player_allstar.year.min()
        allstar_player_year = year_min - row['season']
        all_star_first_year.append(allstar_player_year)
    else:
        all_star_indicator.append(0)
        all_star_first_year.append(np.nan)
df['allstar_bool'] = all_star_indicator
df['allstar_first_year'] = all_star_first_year
df['within7'] = (df.allstar_first_year <= 7).astype(int)
df['within5'] = (df.allstar_first_year <= 5).astype(int)
df.drop_duplicates(subset='player', inplace=True)

nlp feature extraction

within 5 years

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing Punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Removing Stop Words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Joining tokens back into a single text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Get names of 2023 mock draftees

In [30]:
import requests
from bs4 import BeautifulSoup
draft_2023_url = 'https://www.nbadraft.net/nba-mock-drafts/'
response = requests.get(draft_2023_url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', id='nba_mock_consensus_table')
df2023 = pd.read_html(str(table))[0]
player_2023_list = []
for player_name in df2023.Player:
    player_2023_list.append(player_name.lower().replace(' ','-'))

Create dataframe containing the merged df and the 2023 mock draftees called all_df

In [45]:
# combine input data and 2023 data temporarily to run all through vectorizer
all_df = pd.concat([df, draftnet_df.loc[draftnet_df.player.isin(player_2023_list)]])
all_df['text'] = all_df['strengths'] + ' ' + all_df['weaknesses']
all_df['text'] = all_df['text'].apply(preprocess_text)
all_df = all_df.reset_index(drop=True)

# apply NLP vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the max_features parameter
input_cols = ['overall','Athleticism','Size','Defense','Strength','Quickness', 'Leadership','JumpShot','NBAReady']
X = vectorizer.fit_transform(all_df['text'])

# combine vectorized array and the other input columns
X = pd.concat([pd.DataFrame(X.toarray()), all_df[input_cols]], axis=1)
X.columns = X.columns.astype(str) # convert to all string names
# inds = np.where(all_df['draft_year'] == '2023')[0]
# X_2023 = X[inds]
# X = X[all_df['draft_year'] != '2023']

In [46]:
n = 7
if n == 5:
    mask = (all_df.within5 == 1) | (all_df.season <= 2018)
    # data = df.loc[(df.within5 == 1) | (df.season <= 2018)]
else:
    mask = (all_df.within7 == 1) | (all_df.season <= 2016)
    # data = df.loc[(df.within7 == 1) | (df.season <= 2016)]

# mask = mask & (all_df.season != 2015)
# data['text'] = data['strengths'] + ' ' + data['weaknesses']
# data['text'] = data['text'].apply(preprocess_text)

# vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the max_features parameter
# X = vectorizer.fit_transform(data['text'])
X_rest = X[mask]
y = all_df[mask]['within7']

In [47]:
# Step 4: Model Training
X_train, X_test, y_train, y_test = train_test_split(X_rest, y, test_size=0.2, random_state=42, stratify=y)
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
from sklearn.metrics import accuracy_score

# Predict on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print('Model Accuracy:', accuracy)

Model Accuracy: 0.8709677419354839


In [49]:
from sklearn.model_selection import KFold, cross_validate
kf = KFold(n_splits=6, shuffle=True, random_state=2)
scoring = ['accuracy','precision','recall','f1','roc_auc','neg_log_loss']
model = LogisticRegression()
scores = cross_validate(model, X_train, y_train, cv=kf, scoring=scoring)

for metric in scoring:
    metric_scores = scores[f'test_{metric}']
    print(f'{metric}: {metric_scores}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit

accuracy: [0.91935484 0.77419355 0.90322581 0.87096774 0.87096774 0.91803279]
precision: [0. 0. 0. 0. 0. 0.]
recall: [0. 0. 0. 0. 0. 0.]
f1: [0. 0. 0. 0. 0. 0.]
roc_auc: [0.63508772 0.58928571 0.45238095 0.5787037  0.65972222 0.575     ]
neg_log_loss: [-0.27296492 -0.58774047 -0.35383363 -0.3909283  -0.35983756 -0.29634134]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from sklearn.utils import class_weight

# Calculate class weights
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

param_grid = {
    'C': [5, 10, 15, 20],
    'penalty': ['l1','l2'],
    'solver': ['liblinear'],
    'class_weight': [None, class_weights],
    'max_iter': [1000]
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression()
grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='accuracy')

grid_search.fit(X_train, y_train)

print('Best hyperparameters: ', grid_search.best_params_)
print('Best Score: ', grid_search.best_score_)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Andy\anaconda3\envs\nba-stats\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Andy\anaconda3\envs\nba-stats\lib\site-packages\sklearn\linear_model\_logistic.py", line 1160, in fit
    self._validate_params()
  File "c:\Users\Andy\anaconda3\envs\nba-stats\lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Andy\anaconda3\envs\nba-stats\lib\site-packages\sklearn\utils\_param_validation.py", line

Best hyperparameters:  {'C': 10, 'class_weight': None, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score:  0.8814414414414415


Now try it out on this year's draft class

In [53]:
# data = draftnet_df.loc[draftnet_df.draft_year == '2023']
# data['text'] = data['strengths'] + ' ' + data['weaknesses']
# data['text'] = data['text'].apply(preprocess_text)

# mask = all_df['draft_year'] == '2023'
# inds = np.where(all_df['draft_year'] == '2023')[0]
# X_2023 = X[mask]

# vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the max_features parameter
# tmp_X = vectorizer.fit_transform(data['text'])
# tmp_X = 

model = LogisticRegression(**grid_search.best_params_)
model.fit(X_rest, y)
# y_pred = model.predict(data['text'])

In [54]:
# mask = all_df['season'] == 2015
# mask = all_df['draft_year'] == '2023'
mask = all_df.player.isin(player_2023_list)
X_2023 = X[mask]
y_pred_prob = model.predict_proba(X_2023)

In [56]:
df_2023 = all_df[mask]
df_2023['allstar_prob'] = [x[1] * 100 for x in y_pred_prob]
df_2023 = df_2023[['player','allstar_prob']]
df_2023.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2023['allstar_prob'] = [x[1] * 100 for x in y_pred_prob]


Unnamed: 0,player,allstar_prob
713,amen-thompson,12.726752
714,anthony-black,34.590731
715,ausar-thompson,1.529834
716,bilal-coulibaly,36.146948
717,bobi-klintman,16.676989
