# Independent Work Project
The goal of this project is to use data science methods to predict, for each player selected in the first round of the 2019 NBA draft, the probability that he makes an All-NBA team at some point in his career.

### Anaconda
We will use the Anaconda platform for Python for this project

In [433]:
%matplotlib inline

# def warn(*args, **kwargs):
#     pass

# import warnings
# warnings.warn = warn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
import sklearn.metrics as metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
# Random Forest

# Set to True to rebalance the training split with SMOTE before any model is fitted.
oversample = False

### Loading and preparing the data for analysis
The data consists of the college basketball statistics of all first round picks who have played in the NCAA, since the 1990 NBA Draft. The columns consist of:
* Points per game
* Rebounds per game
* Assists per game
* Steals per game
* etc.

In [434]:
# Loading data as Pandas dataframe
df = pd.read_csv('data.csv', header=0)
print(df.head())

# Keep only the numeric/boolean columns (this drops text columns such as the
# player name). select_dtypes is the public replacement for the private
# DataFrame._get_numeric_data(), which may change between pandas releases.
df = df.select_dtypes(include=['number', 'bool'])
headers = list(df.columns)

dataset = df.to_numpy()

# Position of the target column among the numeric columns
y_i = headers.index('All-NBA')

# Separate data into features and target.
# Every numeric column to the LEFT of 'All-NBA' is a feature; any numeric
# column placed after it would be ignored, so the target must stay last.
x = dataset[:, 0:y_i]
y = dataset[:, y_i]

               Player  Pick #   RPG  APG  SPG  BPG   PPG    SOS    TS%    FTr  \
0     Derrick Coleman       1  12.1  2.9  1.5  2.0  17.9   8.85  0.620  0.747   
1         Gary Payton       2   4.7  8.1  3.4  0.5  25.7   6.91  0.572  0.299   
2  Mahmoud Abdul-Rauf       3   2.5  3.2  1.6  0.0  27.8   7.61  0.584  0.317   
3        Dennis Scott       4   6.6  2.0  1.8  0.9  27.7  10.33  0.593  0.281   
4        Kendall Gill       5   4.9  3.3  2.2  0.6  20.0   9.89  0.575  0.415   

   Height  Weight  Age  All-NBA  
0      82     230   23        1  
1      76     180   22        1  
2      73     162   21        0  
3      80     229   22        0  
4      77     195   22        0  


In [435]:
# Split the data into train and test sets.
# stratify=y keeps the class ratio of the target the same in both sets, and
# the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y, random_state=0)

if oversample:
    # Oversample the training split only, so the test split stays untouched.
    # fit_resample replaces fit_sample, which no longer exists in current
    # imbalanced-learn releases; the seed keeps the synthetic samples
    # reproducible, like the split above.
    sm = SMOTE(random_state=0)
    x_train, y_train = sm.fit_resample(x_train, y_train.ravel())

print('Shape of training dataset:', x_train.shape)
print('Shape of testing dataset:', x_test.shape)

Shape of training dataset: (333, 12)
Shape of testing dataset: (112, 12)


# Predictive Models
We will compare two classifiers, a logistic regression and a random forest, each tuned with a cross-validated grid search.

## 1. Logistic Regression Model
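Our first model will be a logistic regression pipeline: the features are standardized, reduced (PCA or recursive feature elimination), and then classified, with the regularization settings chosen by a 5-fold grid search.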


In [436]:
# Candidate values for the search.
# `features` (component counts) is kept for experimentation but is not
# referenced by the active grid below.
features = [2, 4, 8]
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Pipeline steps: standardize -> reduce dimensionality -> logistic regression.
# The 'reduce_dim' step set here is only a placeholder; the grid swaps in the
# actual candidates.
log_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA()),
    ('LOG', LogisticRegression()),
])

# One grid: try PCA or recursive feature elimination as the reduction step,
# combined with every penalty / C pair, always using the liblinear solver.
param_grid = [
    dict(
        reduce_dim=[PCA(iterated_power=7), RFE(LogisticRegression())],
        LOG__solver=['liblinear'],
        LOG__penalty=['l1', 'l2'],
        LOG__C=C,
    ),
]

# 5-fold cross-validated search over the grid (fitted later by evaluate()).
model_log = GridSearchCV(estimator=log_pipeline, param_grid=param_grid, cv=5)

TypeError: __init__() missing 1 required positional argument: 'estimator'

## 2. Random Forest Classifier


In [None]:
# Random forest pipeline: rescale the features with MinMaxScaler, then classify.
# The forest is seeded so repeated runs of the notebook give the same model,
# matching the seeded train/test split.
rfc_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('RFC', RandomForestClassifier(random_state=0)),
])

# Hyperparameters searched for the forest.
# 'auto' was dropped from max_features: for classifiers it was an alias of
# 'sqrt' (so the grid fitted the same setting twice) and recent scikit-learn
# releases reject it.
grid_values = {
    'RFC__max_features': ['sqrt'],
    'RFC__max_depth': [5, 10, 15, 20],
    'RFC__n_estimators': [80, 90, 100, 110, 120],
}

# 5-fold cross-validated search over the grid (fitted later by evaluate()).
model_rfc = GridSearchCV(rfc_pipeline, param_grid=grid_values, cv=5)

# Evaluating the Models
We will now look at a few methods of evaluating the models we've created

In [None]:
# Evaluation method
def _print_scores(y_true, pred, model_name):
    """Print accuracy, recall, precision and F1 of `pred` against `y_true`."""
    scorers = (
        ('Accuracy', metrics.accuracy_score),
        ('Recall', metrics.recall_score),
        ('Precision', metrics.precision_score),
        ('F1', metrics.f1_score),
    )
    for label, scorer in scorers:
        print('%s of "%s" model: %.3f' % (label, model_name, scorer(y_true, pred)))


def _plot_confusion_matrix(y_true, pred):
    """Show the confusion matrix of `pred` against `y_true` as an annotated heatmap."""
    xlabels = ['Predicted 0', 'Predicted 1']
    ylabels = ['Actual 0', 'Actual 1']

    # Pin the label order so the matrix is always 2x2 and lines up with the
    # tick labels, even if one class never appears in y_true or pred.
    cm = confusion_matrix(y_true, pred, labels=[0, 1])
    df_cm = pd.DataFrame(cm, range(2), range(2))
    ax = sn.heatmap(df_cm, square=True, annot=True, xticklabels=xlabels, yticklabels=ylabels)
    ax.set_ylim(2, 0) # workaround for cut-off bug
    plt.show()


def _plot_roc_curve(y_true, probs, model_name):
    """Plot the ROC curve for the scores `probs`, with its AUC in the legend."""
    fpr, tpr, _ = roc_curve(y_true, probs)
    roc_auc = auc(fpr, tpr)

    plt.title('Receiver Operating Characteristic - %s' % model_name)
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # Diagonal reference line (equal true and false positive rates)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()


def evaluate(model, model_name):
    """Fit `model` on the training split and report its results on the test split.

    Note that this function FITS the model as a side effect (using the
    notebook-level x_train / y_train), then prints accuracy, recall,
    precision and F1 on x_test / y_test and shows a confusion-matrix
    heatmap and a ROC curve.

    Parameters
    ----------
    model : estimator providing fit, predict and predict_proba
    model_name : str
        Label used in the printed scores and in the ROC plot title.
    """
    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    # Accuracy, recall, precision and F1
    _print_scores(y_test, pred, model_name)

    # Confusion matrix
    _plot_confusion_matrix(y_test, pred)

    # ROC Curve -- column 1 is taken as the score of the positive class
    # (assumes the model's classes are ordered [0, 1])
    probs = model.predict_proba(x_test)[:, 1]
    _plot_roc_curve(y_test, probs, model_name)

### Dummy Classifier
A dummy classifier assigns labels randomly, according to the distribution of classes in the training set

In [None]:
# Random baseline; seeded so the baseline scores are the same on every run.
dummy = DummyClassifier(strategy='stratified', random_state=0)

In [None]:
# Fit and score the random baseline; its numbers are the reference point for the real models.
evaluate(dummy, 'Dummy')

In [None]:
# evaluate() fits the grid search, so best_params_ only exists after this call.
evaluate(model_log, 'Logistic Regression')
print(model_log.best_params_)

In [None]:
# Fits the random forest grid search on the training split and reports its test scores.
evaluate(model_rfc, 'Random Forest')

In [None]:
# Requires model_rfc to have been fitted already (evaluate() does that).
print(model_rfc.best_params_)

# Results on 2019 Rookies

In [None]:
# Loading data as Pandas dataframe
# (df is rebound to the rookies table here; pred_rookies reads the player names from it)
df = pd.read_csv('rookies.csv', header=0)

# Keep only the numeric/boolean columns as model input. select_dtypes is the
# public replacement for the private DataFrame._get_numeric_data().
# NOTE(review): the models expect the same feature columns, in the same order,
# as the training features -- confirm rookies.csv matches (and has no target column).
stats = df.select_dtypes(include=['number', 'bool'])
rookies = stats.to_numpy()

def pred_rookies(model, model_name):
    """Print each rookie's predicted probability from an already fitted model.

    Reads the notebook-level `rookies` feature array and the `df` table
    (for the 'Player' names). The printed value is column 1 of
    predict_proba, taken as the positive class.

    Parameters
    ----------
    model : fitted estimator providing predict_proba
    model_name : str
        Label shown in the header line.
    """
    pred = model.predict_proba(rookies)

    print('Predicted probabilities (%s)' % model_name)
    print('==================================')
    # Pair names and predictions by position, so the output does not depend
    # on the dataframe's index labels.
    for player, proba in zip(df['Player'], pred):
        print('%-28s %.3f ' % (player, proba[1]))

In [None]:
# Both models must already be fitted (the evaluate() cells do that) before predicting.
pred_rookies(model_log, "Logistic")
pred_rookies(model_rfc, "RFC")