Import Libraries

In [1]:
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

ModuleNotFoundError: No module named 'imblearn'

Read Data

In [None]:
# Load the 2021 season stats; this frame is used as the training set.
df_train = pd.read_csv("2021_stats.csv", sep=",")
df_train.head()

In [None]:
# Pearson correlation between the numeric columns only.
# At this point df_train still holds text columns (e.g. 'Player', 'Pos', 'Tm');
# recent pandas versions no longer drop them silently and raise instead, so the
# numeric/bool columns are selected explicitly before correlating.
df_train.select_dtypes(include=['number', 'bool']).corr(method='pearson')

In [None]:
# Load the 2022 season stats; this frame is used as the held-out test set.
df_test = pd.read_csv("2022_stats.csv", sep=",")
df_test.head()

Pre-processing

In [None]:
# Remove identifier columns and the stats that are not used as model inputs,
# then replace every missing value with 0.
df_train = (
    df_train
    .drop(columns=['Rk','Player','Pos','Tm','Age','G','GS','3P','3PA','3P%','FT%','ORB','BLK','PF'])
    .fillna(0)
)
df_train.head()

In [None]:
# Apply the same column removal and zero-fill to the test frame so that it
# matches the training frame's layout.
df_test = (
    df_test
    .drop(columns=['Rk','Player','Pos','Tm','Age','G','GS','3P','3PA','3P%','FT%','ORB','BLK','PF'])
    .fillna(0)
)
df_test.head()

In [None]:
# Training feature matrix: the 16 stat columns below, as a NumPy array.
X_train = df_train[[
    'MP', 'FG', 'FGA', 'FG%',
    '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'DRB', 'TRB',
    'AST', 'STL', 'TOV', 'PTS',
]].values
# NOTE(review): this encoder is created but not used anywhere in the visible notebook.
le_pos = preprocessing.LabelEncoder()
X_train[:5]

In [None]:
# Training target: the "isAllNBA" column.
y_train = df_train["isAllNBA"]
y_train.head()

In [None]:
# Test feature matrix: same 16 columns, in the same order, as X_train.
X_test = df_test[[
    'MP', 'FG', 'FGA', 'FG%',
    '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'DRB', 'TRB',
    'AST', 'STL', 'TOV', 'PTS',
]].values
# NOTE(review): this encoder is created but not used anywhere in the visible notebook.
le_pos2 = preprocessing.LabelEncoder()
X_test[:5]

In [None]:
# Test target: the "isAllNBA" column of the 2022 frame.
y_test = df_test["isAllNBA"]
y_test.head()

In [None]:
# Oversample the training data with SMOTE (seeded for reproducibility).
# Only the training set is resampled; X_test / y_test are left untouched.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

Modeling

In [None]:
# Entropy-based decision tree, limited to depth 4, with a fixed seed.
decTree = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=4,
    random_state=42,
)
decTree  # display the configured (not yet fitted) estimator

In [None]:
# Fit the entropy tree on the original (non-resampled) training data.
decTree.fit(X_train, y_train)

Prediction

In [None]:
# Predict the test-set labels with the fitted entropy tree.
predTree = decTree.predict(X_test)

In [None]:
# Preview the first 150 predictions of the entropy tree.
predTree[:150]

Evaluation

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
# Test-set accuracy of the entropy tree.
print(
    "DecisionTrees's Accuracy: ",
    metrics.accuracy_score(y_true=y_test, y_pred=predTree),
)

In [None]:
from sklearn.metrics import f1_score
# Support-weighted F1 of the entropy tree on the test set.
f1_score(y_true=y_test, y_pred=predTree, average='weighted')

In [None]:
from sklearn.metrics import jaccard_score
# Jaccard index of the entropy tree, with class 0 treated as the positive label.
jaccard_score(y_true=y_test, y_pred=predTree, pos_label=0)

Visualization

In [None]:
from  io import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline 

In [None]:
# Render the fitted entropy tree (decTree) through Graphviz and show the PNG inline.
dot_data = StringIO()
filename = "dectree.png"
# Name the features with exactly the 16 columns, in the order used to build
# X_train. Slicing df_test.columns[0:17] takes 17 names from a frame that also
# holds the "isAllNBA" target, and export_graphviz rejects a feature_names
# sequence whose length differs from the number of fitted features.
featureNames = ['MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', 'eFG%',
                'FT', 'FTA', 'DRB', 'TRB', 'AST', 'STL', 'TOV', 'PTS']
out = tree.export_graphviz(
    decTree,
    feature_names=featureNames,
    out_file=dot_data,
    class_names=["0", "1"],
    filled=True,
    special_characters=True,
    rotate=False,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
# Read the image back from disk and display it on a large canvas.
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img, interpolation='nearest')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array; rows are plotted as true labels, columns as
                 predicted labels.
    classes   -- tick labels, one per class, used on both axes.
    normalize -- when True, each row is divided by its row sum before the
                 matrix is printed and plotted.
    title     -- title placed above the plot.
    cmap      -- matplotlib colormap used to colour the cells.

    Draws on the current pyplot figure; returns None.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Two decimals for normalized values, plain integers for raw counts.
    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    half_max = cm.max() / 2.
    for row, col in np.ndindex(*cm.shape):
        value = cm[row, col]
        text_colour = "white" if value > half_max else "black"
        plt.text(col, row, format(value, cell_format),
                 horizontalalignment="center",
                 color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Error summary for the entropy tree (predTree): confusion matrix + per-class report.
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predTree, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=predTree))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# Same entropy tree configuration, this time trained on the SMOTE-resampled data.
decTree2 = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=4,
    random_state=42,
).fit(X_res, y_res)
predTree2 = decTree2.predict(X_test)

In [None]:
# Preview the first 150 predictions of the SMOTE-trained entropy tree.
predTree2[:150]

In [None]:
# Support-weighted F1 of the SMOTE-trained entropy tree.
f1_score(y_true=y_test, y_pred=predTree2, average='weighted')

In [None]:
# Jaccard index of the SMOTE-trained entropy tree, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=predTree2, pos_label=0)

In [None]:
# Error summary for the SMOTE-trained entropy tree (predTree2).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predTree2, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=predTree2))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# Render the SMOTE-trained entropy tree (decTree2) and show the PNG inline.
# The file name is the same one the earlier tree cell writes, so the PNG on
# disk is overwritten.
dot_data = StringIO()
filename = "dectree.png"
# Name the features with exactly the 16 columns, in the order used to build
# the training matrix. Slicing df_test.columns[0:17] takes 17 names from a
# frame that also holds the "isAllNBA" target, and export_graphviz rejects a
# feature_names sequence whose length differs from the number of fitted features.
featureNames = ['MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', 'eFG%',
                'FT', 'FTA', 'DRB', 'TRB', 'AST', 'STL', 'TOV', 'PTS']
out = tree.export_graphviz(
    decTree2,
    feature_names=featureNames,
    out_file=dot_data,
    class_names=["0", "1"],
    filled=True,
    special_characters=True,
    rotate=False,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
# Read the image back from disk and display it on a large canvas.
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img, interpolation='nearest')

In [None]:
# Gini-based tree (depth 4, fixed seed) trained on the original training data.
decTree3 = DecisionTreeClassifier(
    criterion="gini",
    max_depth=4,
    random_state=42,
).fit(X_train, y_train)
predTree3 = decTree3.predict(X_test)

In [None]:
# Preview the first 150 predictions of the Gini tree.
predTree3[:150]

In [None]:
# Support-weighted F1 of the Gini tree.
f1_score(y_true=y_test, y_pred=predTree3, average='weighted')

In [None]:
# Jaccard index of the Gini tree, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=predTree3, pos_label=0)

In [None]:
# Error summary for the Gini tree (predTree3).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predTree3, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=predTree3))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# Render the Gini tree (decTree3) and show the PNG inline.
# The file name is the same one the earlier tree cells write, so the PNG on
# disk is overwritten.
dot_data = StringIO()
filename = "dectree.png"
# Name the features with exactly the 16 columns, in the order used to build
# X_train. Slicing df_test.columns[0:17] takes 17 names from a frame that also
# holds the "isAllNBA" target, and export_graphviz rejects a feature_names
# sequence whose length differs from the number of fitted features.
featureNames = ['MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', 'eFG%',
                'FT', 'FTA', 'DRB', 'TRB', 'AST', 'STL', 'TOV', 'PTS']
out = tree.export_graphviz(
    decTree3,
    feature_names=featureNames,
    out_file=dot_data,
    class_names=["0", "1"],
    filled=True,
    special_characters=True,
    rotate=False,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
# Read the image back from disk and display it on a large canvas.
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img, interpolation='nearest')

In [None]:
# Gini-based tree (depth 4, fixed seed) trained on the SMOTE-resampled data.
decTree4 = DecisionTreeClassifier(
    criterion="gini",
    max_depth=4,
    random_state=42,
).fit(X_res, y_res)
predTree4 = decTree4.predict(X_test)

In [None]:
# Preview the first 150 predictions of the SMOTE-trained Gini tree.
predTree4[:150]

In [None]:
# Support-weighted F1 of the SMOTE-trained Gini tree.
f1_score(y_true=y_test, y_pred=predTree4, average='weighted')

In [None]:
# Jaccard index of the SMOTE-trained Gini tree, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=predTree4, pos_label=0)

In [None]:
# Error summary for the SMOTE-trained Gini tree (predTree4).
cnf_matrix = confusion_matrix(y_test, predTree4, labels=[0,1])
np.set_printoptions(precision=2)

# Report on predTree4 so the text report matches the confusion matrix above;
# the previous version printed the report for predTree3 (a different model).
print (classification_report(y_test, predTree4))

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Not All-NBA(0)','All-NBA(1)'],normalize= False,  title='Confusion matrix')

In [None]:
# Render the SMOTE-trained Gini tree (decTree4) and show the PNG inline.
# The file name is the same one the earlier tree cells write, so the PNG on
# disk is overwritten.
dot_data = StringIO()
filename = "dectree.png"
# Name the features with exactly the 16 columns, in the order used to build
# the training matrix. Slicing df_test.columns[0:17] takes 17 names from a
# frame that also holds the "isAllNBA" target, and export_graphviz rejects a
# feature_names sequence whose length differs from the number of fitted features.
featureNames = ['MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', 'eFG%',
                'FT', 'FTA', 'DRB', 'TRB', 'AST', 'STL', 'TOV', 'PTS']
out = tree.export_graphviz(
    decTree4,
    feature_names=featureNames,
    out_file=dot_data,
    class_names=["0", "1"],
    filled=True,
    special_characters=True,
    rotate=False,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
# Read the image back from disk and display it on a large canvas.
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img, interpolation='nearest')

In [None]:
from sklearn import svm
# RBF-kernel SVC with library defaults, fitted on the original training data.
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

In [None]:
# Test-set predictions of the RBF SVC; show the first five.
yhat = clf.predict(X_test)
yhat[:5]

In [None]:
# Preview the first 150 predictions of the RBF SVC.
yhat[:150]

In [None]:
from sklearn.metrics import accuracy_score
# Test-set accuracy of the RBF SVC.
accuracy_score(y_true=y_test, y_pred=yhat)

In [None]:
# Support-weighted F1 of the RBF SVC.
f1_score(y_true=y_test, y_pred=yhat, average='weighted')

In [None]:
# Jaccard index of the RBF SVC, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=yhat, pos_label=0)

In [None]:
# Error summary for the RBF SVC (yhat).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=yhat, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=yhat))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# RBF-kernel SVC with library defaults, fitted on the SMOTE-resampled data.
clf2 = svm.SVC(kernel='rbf')
clf2.fit(X_res, y_res)

In [None]:
# Test-set predictions of the SMOTE-trained RBF SVC; show the first five.
yhat2 = clf2.predict(X_test)
yhat2[:5]

In [None]:
# Preview the first 150 predictions of the SMOTE-trained RBF SVC.
yhat2[:150]

In [None]:
# Test-set accuracy of the SMOTE-trained RBF SVC.
accuracy_score(y_true=y_test, y_pred=yhat2)

In [None]:
# Support-weighted F1 of the SMOTE-trained RBF SVC.
f1_score(y_true=y_test, y_pred=yhat2, average='weighted')

In [None]:
# Jaccard index of the SMOTE-trained RBF SVC, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=yhat2, pos_label=0)

In [None]:
# Error summary for the SMOTE-trained RBF SVC (yhat2).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=yhat2, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=yhat2))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Standardise the features, then fit an SVC; the search below tunes the SVC step.
pipelineSVC = make_pipeline(StandardScaler(), SVC(random_state=1))

# Candidate settings: a linear kernel over C, and an RBF kernel over C x gamma.
param_grid_svc = [
    {
        'svc__C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0],
        'svc__kernel': ['linear'],
    },
    {
        'svc__C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0],
        'svc__gamma': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0],
        'svc__kernel': ['rbf'],
    },
]

# 10-fold cross-validated grid search scored on accuracy; the best setting is
# refit on the full training data (refit=True).
gsSVC = GridSearchCV(
    estimator=pipelineSVC,
    param_grid=param_grid_svc,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=1,
)
gsSVC.fit(X_train, y_train)

# Mean cross-validated accuracy of the best parameter combination.
print(gsSVC.best_score_)
# The parameter combination that achieved it.
print(gsSVC.best_params_)

In [None]:
# Test-set predictions of the grid-searched SVC; show the first five.
yhat3 = gsSVC.predict(X_test)
yhat3[:5]

In [None]:
# Preview the first 150 predictions of the grid-searched SVC.
yhat3[:150]

In [None]:
# Test-set accuracy of the grid-searched SVC.
accuracy_score(y_true=y_test, y_pred=yhat3)

In [None]:
# Support-weighted F1 of the grid-searched SVC.
f1_score(y_true=y_test, y_pred=yhat3, average='weighted')

In [None]:
# Jaccard index of the grid-searched SVC, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=yhat3, pos_label=0)

In [None]:
# Error summary for the grid-searched SVC (yhat3).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=yhat3, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=yhat3))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# Grid-search the SVC with SMOTE applied inside every cross-validation split.
#
# Searching on the already-resampled X_res / y_res lets synthetic minority
# samples, interpolated from rows that end up in a validation fold, appear in
# the matching training fold, which inflates the cross-validated score.
# Making SMOTE a pipeline step means it is fitted on the training part of each
# split only, and validation folds contain real, un-resampled rows.
# `Pipeline` here is the imbalanced-learn pipeline imported at the top of the
# notebook; it skips the sampler at predict time.
pipelineSVC_smote = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('standardscaler', StandardScaler()),
    ('svc', SVC(random_state=1)),
])
# Step names match make_pipeline's, so param_grid_svc can be reused unchanged.
gsSVC2 = GridSearchCV(estimator=pipelineSVC_smote,
                     param_grid = param_grid_svc,
                     scoring='accuracy',
                     cv=10,
                     refit=True,
                     n_jobs=1)
#
# Train on the original data; oversampling now happens inside the pipeline
#
gsSVC2.fit(X_train, y_train)
#
# Print the mean cross-validated accuracy of the best model
#
print(gsSVC2.best_score_)
#
# Print the model parameters of the best model
#
print(gsSVC2.best_params_)

In [None]:
# Test-set predictions of the SMOTE grid-searched SVC (gsSVC2); show the first five.
yhat4 = gsSVC2.predict(X_test)
yhat4[:5]

In [None]:
# Preview the first 150 predictions of gsSVC2.
yhat4[:150]

In [None]:
# Test-set accuracy of gsSVC2.
accuracy_score(y_true=y_test, y_pred=yhat4)

In [None]:
# Support-weighted F1 of gsSVC2.
f1_score(y_true=y_test, y_pred=yhat4, average='weighted')

In [None]:
# Jaccard index of gsSVC2, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=yhat4, pos_label=0)

In [None]:
# Error summary for gsSVC2 (yhat4).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=yhat4, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=yhat4))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron: one hidden layer of 64 ReLU units, up to 1000 iterations, fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(64,),
    activation='relu',
    max_iter=1000,
    epsilon=1e-08,
    random_state=42,
)

In [None]:
from sklearn.metrics import accuracy_score
# Fit the MLP on the original training data and predict the test set.
predANN = mlp.fit(X_train, y_train).predict(X_test)

In [None]:
# Preview the first 150 predictions of the MLP.
predANN[:150]

In [None]:
# Support-weighted F1 of the MLP.
f1_score(y_true=y_test, y_pred=predANN, average='weighted')

In [None]:
# Jaccard index of the MLP, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=predANN, pos_label=0)

In [None]:
# Error summary for the MLP (predANN).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predANN, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=predANN))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# Same MLP configuration, trained on the SMOTE-resampled data.
mlp2 = MLPClassifier(
    hidden_layer_sizes=(64,),
    activation='relu',
    max_iter=1000,
    epsilon=1e-08,
    random_state=42,
).fit(X_res, y_res)
predANN2 = mlp2.predict(X_test)

In [None]:
# Preview the first 150 predictions of the SMOTE-trained MLP.
predANN2[:150]

In [None]:
# Support-weighted F1 of the SMOTE-trained MLP.
f1_score(y_true=y_test, y_pred=predANN2, average='weighted')

In [None]:
# Jaccard index of the SMOTE-trained MLP, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=predANN2, pos_label=0)

In [None]:
# Error summary for the SMOTE-trained MLP (predANN2).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predANN2, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=predANN2))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# Logistic regression (C=0.01, liblinear solver) on the original training data.
LR = LogisticRegression(C=0.01, solver='liblinear')
LR.fit(X_train, y_train)
predLR = LR.predict(X_test)

In [None]:
# Preview the first 150 predictions of the logistic regression.
predLR[:150]

In [None]:
# Support-weighted F1 of the logistic regression.
f1_score(y_true=y_test, y_pred=predLR, average='weighted')

In [None]:
# Jaccard index of the logistic regression, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=predLR, pos_label=0)

In [None]:
# Error summary for the logistic regression (predLR).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predLR, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=predLR))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# Same logistic regression configuration, trained on the SMOTE-resampled data.
LR2 = LogisticRegression(C=0.01, solver='liblinear')
LR2.fit(X_res, y_res)
predLR2 = LR2.predict(X_test)

In [None]:
# Preview the first 150 predictions of the SMOTE-trained logistic regression.
predLR2[:150]

In [None]:
# Support-weighted F1 of the SMOTE-trained logistic regression.
f1_score(y_true=y_test, y_pred=predLR2, average='weighted')

In [None]:
# Jaccard index of the SMOTE-trained logistic regression (LR2), class 0 as the
# positive label. Scores predLR2; the previous version re-scored predLR, i.e.
# the model trained on the original data.
jaccard_score(y_test, predLR2, pos_label=0)

In [None]:
# Error summary for the SMOTE-trained logistic regression (predLR2).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predLR2, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=predLR2))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# Standardise the features, then fit an L2-penalised logistic regression (lbfgs solver).
pipelineLR = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=1, penalty='l2', solver='lbfgs'),
)

# Candidate values for the regularisation parameter C.
param_grid_lr = [
    {'logisticregression__C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0]},
]

# 10-fold cross-validated grid search scored on accuracy; the best setting is
# refit on the full training data (refit=True).
gsLR = GridSearchCV(
    estimator=pipelineLR,
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=1,
)
gsLR.fit(X_train, y_train)

# Mean cross-validated accuracy of the best parameter combination.
print(gsLR.best_score_)
# The parameter combination that achieved it.
print(gsLR.best_params_)

In [None]:
# Test-set predictions of the grid-searched logistic regression.
predgsLR = gsLR.predict(X_test)

In [None]:
# Preview the first 150 predictions of the grid-searched logistic regression.
predgsLR[:150]

In [None]:
# Support-weighted F1 of the grid-searched logistic regression.
f1_score(y_true=y_test, y_pred=predgsLR, average='weighted')

In [None]:
# Jaccard index of the grid-searched logistic regression, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=predgsLR, pos_label=0)

In [None]:
# Error summary for the grid-searched logistic regression (predgsLR).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predgsLR, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=predgsLR))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# Grid-search the logistic regression with SMOTE applied inside every
# cross-validation split.
#
# Searching on the already-resampled X_res / y_res lets synthetic minority
# samples, interpolated from rows that end up in a validation fold, appear in
# the matching training fold, which inflates the cross-validated score.
# Making SMOTE a pipeline step means it is fitted on the training part of each
# split only, and validation folds contain real, un-resampled rows.
# `Pipeline` here is the imbalanced-learn pipeline imported at the top of the
# notebook; it skips the sampler at predict time.
pipelineLR_smote = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('standardscaler', StandardScaler()),
    ('logisticregression', LogisticRegression(random_state=1, penalty='l2', solver='lbfgs')),
])
#
# Create the parameter grid (same candidates as the non-resampled search)
#
param_grid_lr = [{
    'logisticregression__C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0],
}]
#
# Create an instance of GridSearch Cross-validation estimator
#
gsLR2 = GridSearchCV(estimator=pipelineLR_smote,
                     param_grid = param_grid_lr,
                     scoring='accuracy',
                     cv=10,
                     refit=True,
                     n_jobs=1)
#
# Train on the original data; oversampling now happens inside the pipeline
#
gsLR2 = gsLR2.fit(X_train, y_train)
#
# Print the mean cross-validated accuracy of the best model
#
print(gsLR2.best_score_)
#
# Print the model parameters of the best model
#
print(gsLR2.best_params_)

In [None]:
# Test-set predictions of the SMOTE grid-searched logistic regression (gsLR2).
predgsLR2 = gsLR2.predict(X_test)

In [None]:
# Preview the first 150 predictions of gsLR2.
predgsLR2[:150]

In [None]:
# Support-weighted F1 of gsLR2.
f1_score(y_true=y_test, y_pred=predgsLR2, average='weighted')

In [None]:
# Jaccard index of gsLR2, class 0 as the positive label.
jaccard_score(y_true=y_test, y_pred=predgsLR2, pos_label=0)

In [None]:
# Error summary for gsLR2 (predgsLR2).
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predgsLR2, labels=[0, 1])

print(classification_report(y_true=y_test, y_pred=predgsLR2))

# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Not All-NBA(0)', 'All-NBA(1)'],
    normalize=False,
    title='Confusion matrix',
)