### Brainstorming
Given features:
- players
- outcome (1)
- teams?
- lineup score

Player lineups give different scores -> used as an input

https://sportsdata.io/developers/fantasy-scoring-system/nba

### Import Libraries and Data

In [4]:
import pandas as pd
import numpy as np
import glob
import joblib
import random
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, brier_score_loss, label_ranking_average_precision_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [6]:
filepaths = glob.glob("matchups-*.csv")
data = pd.concat([pd.read_csv(file) for file in filepaths], ignore_index=True)

In [7]:
x_columns = [
    'home_team', 'away_team', 'season', 'starting_min',
    'home_0','home_1','home_2','home_3','home_4',
    'away_0','away_1','away_2','away_3','away_4',
    'outcome'
]
rawX = data.filter(items=x_columns, axis=1)

In [8]:
classes = {}
def encode_column(column, encoders, X, rawX):
    if column == 'season' or column == 'starting_min'or column == 'outcome':
        X[column] = rawX[column]
    elif column == 'home_0' or column == 'home_1' or column == 'home_2' or column == 'home_3' or column == 'home_4':
        encoder = LabelEncoder()
        encoder.fit(rawX[column])
        classes[column] = encoder.classes_

        # Add ? value to encoding for test data
        encoder.classes_ = np.array(list(encoder.classes_) + ['?'])
        
        X[column] = encoder.transform(rawX[column])
        encoders[column] = encoder
    else:
        encoder = LabelEncoder()
        X[column] = encoder.fit_transform(rawX[column])
        encoders[column] = encoder

X = pd.DataFrame()
encoders = {}
for column in x_columns:
    encode_column(column, encoders, X, rawX)
    
print(X)
for index, label in enumerate(encoders['home_team'].classes_):
    print(f"{label} -> {index}")

        home_team  away_team  season  starting_min  home_0  home_1  home_2  \
0              14         27    2007             0      51     556     527   
1              14         27    2007             6      51     556     527   
2              14         27    2007             8     555     587     576   
3              14         27    2007            10     555     587     576   
4              14         27    2007            11     575     633     719   
...           ...        ...     ...           ...     ...     ...     ...   
236907         23         26    2015            35     437     714     702   
236908         23         26    2015            39     437     560     673   
236909         23         26    2015            40     437     560     673   
236910         23         26    2015            42      46     414     496   
236911         23         26    2015            45      46     414     496   

        home_3  home_4  away_0  away_1  away_2  away_3  away_4 

In [76]:
clf0 = RandomForestClassifier(max_depth=10, random_state=0)
clf1 = RandomForestClassifier(max_depth=10, random_state=0)
clf2 = RandomForestClassifier(max_depth=10, random_state=0)
clf3 = RandomForestClassifier(max_depth=10, random_state=0)
clf4 = RandomForestClassifier(max_depth=10, random_state=0)

Y0 = data['home_0']
Y1 = data['home_1']
Y2 = data['home_2']
Y3 = data['home_3']
Y4 = data['home_4']

X0 = X.drop(columns=['home_0'])
X1 = X.drop(columns=['home_1'])
X2 = X.drop(columns=['home_2'])
X3 = X.drop(columns=['home_3'])
X4 = X.drop(columns=['home_4'])

### Train Classifiers

In [56]:
clf0.fit(X0, Y0)

In [66]:
_ = joblib.dump(clf0, 'models/clf0.joblib.pkl', compress=9)

In [57]:
clf1.fit(X1, Y1)

In [67]:
_ = joblib.dump(clf1, 'models/clf1.joblib.pkl', compress=9)

In [58]:
clf2.fit(X2, Y2)

In [68]:
_ = joblib.dump(clf2, 'models/clf2.joblib.pkl', compress=9)

In [59]:
clf3.fit(X3, Y3)

In [69]:
_ = joblib.dump(clf3, 'models/clf3.joblib.pkl', compress=9)

In [60]:
clf4.fit(X4, Y4)

In [70]:
_ = joblib.dump(clf4, 'models/clf4.joblib.pkl', compress=9)

### Evaluate with Classifiers

In [6]:
def handle_unknown_value(encoder, value):
    encoder.classes_ = np.array(list(encoder.classes_) + [value])
    return encoder.transform([value])[0]

In [53]:
#Import test data
test_data = pd.read_csv(f"NBA_test.csv")
test_X = pd.DataFrame()

#Apply label encoding to columns
for column in x_columns:
    if column == 'outcome':
        continue
    if column == 'starting_min' or column == 'season':
        test_X[column] = test_data[column]
        continue
    else:
        test_X[column] = [encoders[column].transform([x])[0] if x in encoders[column].classes_ else handle_unknown_value(encoders[column], x) for x in test_data[column]]

#Add "outcome" column with value 1 (assume winning value)
test_X['outcome'] = '1'

In [54]:
print(encoders['home_0'].transform(['?'])[0])
print(encoders['home_1'].transform(['?'])[0])
print(encoders['home_2'].transform(['?'])[0])
print(encoders['home_3'].transform(['?'])[0])
print(encoders['home_4'].transform(['?'])[0])
print(test_X)

650
775
796
779
660
     home_team  away_team  season  starting_min  home_0  home_1  home_2  \
0           12          1    2007            18     215     194     451   
1           11          7    2007            16     102     775     442   
2           30         28    2007            39      89     775      48   
3           18          1    2007            21     650     527     685   
4           15         14    2007            19     174     336     599   
..         ...        ...     ...           ...     ...     ...     ...   
995          2          5    2016            20     650     452     778   
996         32          5    2016            21     241     211     796   
997          7          9    2016            13     234     775     180   
998         22         25    2016            24      61     285     796   
999          2         23    2016            20     123     246     387   

     home_3  home_4  away_0  away_1  away_2  away_3  away_4 outcome  
0       5

In [55]:
#Import models
clf0 = joblib.load('models/clf0.joblib.pkl')
clf1 = joblib.load('models/clf1.joblib.pkl')
clf2 = joblib.load('models/clf2.joblib.pkl')
clf3 = joblib.load('models/clf3.joblib.pkl')
clf4 = joblib.load('models/clf4.joblib.pkl')

In [56]:
player_data = pd.read_csv('player_data.csv')

In [57]:
placeholders = []
placeholders.append(encoders['home_0'].transform(['?'])[0])
placeholders.append(encoders['home_1'].transform(['?'])[0])
placeholders.append(encoders['home_2'].transform(['?'])[0])
placeholders.append(encoders['home_3'].transform(['?'])[0])
placeholders.append(encoders['home_4'].transform(['?'])[0])

def decide_class(predictions : np.ndarray, missing_feature) -> pd.Series:
    preds = pd.Series(data=predictions[0], index=classes[missing_feature])
    scores = {}
    for index, value in preds.items():
        try:
            scores[index] = 0.7 * value + 0.3 * player_data.at[index, "score"]/player_data.at[index, "total_games"]
        except KeyError:
            scores[index] = 0.7 * value
    return pd.Series(scores).idxmax() # return max

def evaluate(row : pd.Series) -> pd.Series:
    if row['home_0'] == placeholders[0]:
        return decide_class(clf0.predict_proba([row.drop(labels='home_0')]), 'home_0')
    elif row['home_1'] == placeholders[1]:
        return decide_class(clf1.predict_proba([row.drop(labels='home_1')]), 'home_1')
    elif row['home_2'] == placeholders[2]:
        return decide_class(clf2.predict_proba([row.drop(labels='home_2')]), 'home_2')
    elif row['home_3'] == placeholders[3]:
        return decide_class(clf3.predict_proba([row.drop(labels='home_3')]), 'home_3')
    elif row['home_4'] == placeholders[4]:
        return decide_class(clf4.predict_proba([row.drop(labels='home_4')]), 'home_4')
    else:
        print(row)
        raise Exception("Invalid row provided.")

In [58]:
predictions = test_X.apply(evaluate, axis=1)



In [59]:
print(predictions)

0            Roy Hibbert
1             Chris Paul
2        Darren Collison
3             Chris Bosh
4          Zach Randolph
             ...        
995         Courtney Lee
996    Jonas Valanciunas
997         Devin Harris
998      Harrison Barnes
999         Vince Carter
Length: 1000, dtype: object


### Cross Validation

In [7]:
# Initialize K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
clf0 = RandomForestClassifier(max_depth=10, random_state=0)
clf1 = RandomForestClassifier(max_depth=10, random_state=0)
clf2 = RandomForestClassifier(max_depth=10, random_state=0)
clf3 = RandomForestClassifier(max_depth=10, random_state=0)
clf4 = RandomForestClassifier(max_depth=10, random_state=0)

In [9]:
accuracy_scores = []

In [20]:
# UPDATE TEST DATA TO INCLUDE ?s
def add_placeholders(test_X):
    for index, row in test_X.iterrows():
        n = random.randint(0, 4)
        player_index = f"home_{n}"
        row[player_index] = placeholders[n]

In [21]:
# Perform K-Fold Cross-Validation
def kfold_validate(train_index, test_index) :
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    add_placeholders(X_test)
    
    Y0 = X_train['home_0']
    Y1 = X_train['home_1']
    Y2 = X_train['home_2']
    Y3 = X_train['home_3']
    Y4 = X_train['home_4']
    
    X0 = X_train.drop(columns=['home_0'])
    X1 = X_train.drop(columns=['home_1'])
    X2 = X_train.drop(columns=['home_2'])
    X3 = X_train.drop(columns=['home_3'])
    X4 = X_train.drop(columns=['home_4'])
    
    clf0.fit(X0, Y0)
    clf1.fit(X1, Y1)
    clf2.fit(X2, Y2)
    clf3.fit(X3, Y3)
    clf4.fit(X4, Y4)
    
    y_test = X_test.apply(evaluate, axis=1)
    acc = accuracy_score(y_test, predictions)
    accuracy_scores.append(acc)

In [22]:
indeces = []
for train_index, test_index in kf.split(X):
    indeces.append([train_index, test_index])

In [23]:
kfold_validate(indeces[0][0], indeces[0][1])

home_team         14
away_team         27
season          2007
starting_min      11
home_0           575
home_1           633
home_2           719
home_3           688
home_4           631
away_0           103
away_1           388
away_2           493
away_3           412
away_4           302
outcome           -1
Name: 4, dtype: int64


Exception: Invalid row provided.

In [None]:
kfold_validate(indeces[1][0], indeces[1][1])

In [None]:
kfold_validate(indeces[2][0], indeces[2][1])

In [None]:
kfold_validate(indeces[3][0], indeces[3][1])

In [None]:
kfold_validate(indeces[4][0], indeces[4][1])

In [None]:
print(f'Accuracy Scores for each fold: {accuracy_scores}')
print(f'Average Accuracy: {np.mean(accuracy_scores):.4f}')

In [None]:
# Align feature columns to match training data
X_test = X_test[X.columns]

In [None]:
# Evaluate Model
for index, row in X_test.iterrows():
    evaluate(row, X_test.iloc[[index]])

### Separate Performance Evaluation

In [None]:
# test_df -> NBA_test.csv
# split test_df -> 5 separate dataframes 
#-> each contains all entries where Xi = '?'

# clf0.evaluate(df_test_0) -> tests on only data where 'home_0' = '?'
# clf1.evaluate(df_test_1)
# ...

In [9]:
def handle_unknown_value(encoder, value):
    encoder.classes_ = np.array(list(encoder.classes_) + [value])
    return encoder.transform([value])[0]
# UPDATE TEST DATA TO INCLUDE ?s
def add_placeholders(test_X):
    for index, row in test_X.iterrows():
        n = random.randint(0, 4)
        player_index = f"home_{n}"
        row[player_index] = placeholders[n]

In [22]:
def decide_class(predictions : np.ndarray, missing_feature) -> pd.Series:
    preds = pd.Series(data=predictions[0], index=classes[f'home_{missing_feature}'])
    scores = {}
    for index, value in preds.items():
        try:
            scores[index] = 0.7 * value + 0.3 * player_data.at[index, "score"]/player_data.at[index, "total_games"]
        except KeyError:
            scores[index] = 0.7 * value
    return pd.Series(scores).idxmax() # return max

def evaluate(row : pd.Series) -> pd.Series:
    if row['home_0'] == placeholders[0]:
        return decide_class(clf0.predict_proba([row.drop(labels='home_0')]), 0)
    elif row['home_1'] == placeholders[1]:
        return decide_class(clf1.predict_proba([row.drop(labels='home_1')]), 1)
    elif row['home_2'] == placeholders[2]:
        return decide_class(clf2.predict_proba([row.drop(labels='home_2')]), 2)
    elif row['home_3'] == placeholders[3]:
        return decide_class(clf3.predict_proba([row.drop(labels='home_3')]), 3)
    elif row['home_4'] == placeholders[4]:
        return decide_class(clf4.predict_proba([row.drop(labels='home_4')]), 4)
    else:
        print(row)
        raise Exception("Invalid row provided.")

In [47]:
test_labels = pd.read_csv("NBA_test_labels.csv")

In [43]:
#Import test data
test_data = pd.read_csv(f"NBA_test.csv")
test_X = pd.DataFrame()

#Apply label encoding to columns
for column in x_columns:
    if column == 'outcome':
        continue
    if column == 'starting_min' or column == 'season':
        test_X[column] = test_data[column]
        continue
    else:
        test_X[column] = [encoders[column].transform([x])[0] if x in encoders[column].classes_ else handle_unknown_value(encoders[column], x) for x in test_data[column]]

#Add "outcome" column with value 1 (assume winning value)
test_X['outcome'] = '1'

placeholders = []
placeholders.append(encoders['home_0'].transform(['?'])[0])
placeholders.append(encoders['home_1'].transform(['?'])[0])
placeholders.append(encoders['home_2'].transform(['?'])[0])
placeholders.append(encoders['home_3'].transform(['?'])[0])
placeholders.append(encoders['home_4'].transform(['?'])[0])

KeyboardInterrupt: 

In [50]:
predictions = test_X.apply(evaluate, axis=1)



ValueError: X has 12 features, but RandomForestClassifier is expecting 14 features as input.

In [69]:
print(pd.DataFrame(predictions, columns=['a']))
print(pd.DataFrame(test_labels, columns=['a']))
p = pd.concat([pd.DataFrame(predictions, columns=['a']), pd.DataFrame(test_labels, columns=['a'])], axis=0)
print(p)
encoder = LabelEncoder()
encoder.fit(p['a'])
test_labels = encoder.transform(test_labels)
predictions = encoder.transform(predictions)
acc = accuracy_score(test_labels, predictions)
err = mean_squared_error(test_labels, predictions)

       a
0    401
1     79
2    102
3     75
4    484
..   ...
995   88
996  231
997  122
998  173
999  474

[1000 rows x 1 columns]
       a
0    463
1     84
2     54
3     89
4    435
..   ...
995  127
996  231
997  122
998  181
999  476

[1000 rows x 1 columns]
       a
0    401
1     79
2    102
3     75
4    484
..   ...
995  127
996  231
997  122
998  181
999  476

[2000 rows x 1 columns]


In [70]:
print(acc)
print(err)

0.309
4260.106
