### Brainstorming
Given features:
- players
- outcome (1)
- teams?
- lineup score

Different player lineups produce different lineup scores, so the lineup score is used as an input feature.

Fantasy scoring reference: https://sportsdata.io/developers/fantasy-scoring-system/nba

### Import Libraries and Data

In [2]:
import pandas as pd
import numpy as np
import glob
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [3]:
# Collect every matchup file in the working directory. glob returns paths in
# arbitrary, platform-dependent order, so sort them to keep the row order of
# `data` (and anything fitted on it) reproducible between runs.
filepaths = sorted(glob.glob("matchups-*.csv"))
if not filepaths:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No files matching 'matchups-*.csv' were found in the working directory.")
data = pd.concat([pd.read_csv(file) for file in filepaths], ignore_index=True)

In [4]:
# Columns used by the models, listed in the order they appear as features.
x_columns = (
    ['home_team', 'away_team', 'season', 'starting_min']
    + ['home_0', 'home_1', 'home_2', 'home_3', 'home_4']   # home lineup columns
    + ['away_0', 'away_1', 'away_2', 'away_3', 'away_4']   # away lineup columns
    + ['outcome']
)
# Keep only the listed columns of the raw data.
rawX = data.filter(items=x_columns, axis=1)

In [65]:
# Training-time label list per home_* column (without the '?' placeholder).
classes = {}
def encode_column(column, encoders, X, rawX):
    """Copy or label-encode one column of ``rawX`` into ``X``.

    * 'season', 'starting_min' and 'outcome' are copied through unchanged.
    * 'home_0' .. 'home_4' are label-encoded; the fitted label list is stored
      in the module-level ``classes`` dict, and an extra '?' class is appended
      to the encoder afterwards.
    * Every other column is label-encoded with a plain ``LabelEncoder``.

    Parameters:
        column: name of the column to process.
        encoders: dict that receives the fitted encoder for each encoded column.
        X: destination DataFrame, modified in place.
        rawX: source DataFrame holding the raw values.
    """
    passthrough = ('season', 'starting_min', 'outcome')
    home_slots = ('home_0', 'home_1', 'home_2', 'home_3', 'home_4')

    if column in passthrough:
        X[column] = rawX[column]
        return

    label_encoder = LabelEncoder()
    if column in home_slots:
        label_encoder.fit(rawX[column])
        # Remember the labels as fitted, before the placeholder is added.
        classes[column] = label_encoder.classes_

        # Add ? value to encoding for test data
        label_encoder.classes_ = np.array([*label_encoder.classes_, '?'])

        X[column] = label_encoder.transform(rawX[column])
    else:
        X[column] = label_encoder.fit_transform(rawX[column])
    encoders[column] = label_encoder

# Build the encoded feature frame one column at a time. `encoders` collects the
# fitted encoder of every label-encoded column, keyed by column name.
X = pd.DataFrame()
encoders = {}
for column in x_columns:
    encode_column(column, encoders, X, rawX)
    
print(X)
# Show which integer code each home_team label was assigned.
for index, label in enumerate(encoders['home_team'].classes_):
    print(f"{label} -> {index}")

        home_team  away_team  season  starting_min  home_0  home_1  home_2  \
0              14         27    2007             0      51     556     527   
1              14         27    2007             6      51     556     527   
2              14         27    2007             8     555     587     576   
3              14         27    2007            10     555     587     576   
4              14         27    2007            11     575     633     719   
...           ...        ...     ...           ...     ...     ...     ...   
236907         23         26    2015            35     437     714     702   
236908         23         26    2015            39     437     560     673   
236909         23         26    2015            40     437     560     673   
236910         23         26    2015            42      46     414     496   
236911         23         26    2015            45      46     414     496   

        home_3  home_4  away_0  away_1  away_2  away_3  away_4 

In [76]:
# One forest per home lineup column, all created with the same settings.
clf0, clf1, clf2, clf3, clf4 = (
    RandomForestClassifier(max_depth=10, random_state=0) for _ in range(5)
)

# Targets: the raw (un-encoded) values of each home lineup column.
Y0, Y1, Y2, Y3, Y4 = (
    data[slot] for slot in ('home_0', 'home_1', 'home_2', 'home_3', 'home_4')
)

# Features: the encoded frame with the column being predicted removed.
X0, X1, X2, X3, X4 = (
    X.drop(columns=[slot]) for slot in ('home_0', 'home_1', 'home_2', 'home_3', 'home_4')
)

### Train Classifiers

In [56]:
# Train the home_0 model: X0 is the encoded frame without home_0, Y0 the raw home_0 values.
clf0.fit(X0, Y0)

In [66]:
# Save the home_0 model; assigning to `_` keeps the cell from displaying joblib.dump's return value.
# Assumes the models/ directory already exists.
_ = joblib.dump(clf0, 'models/clf0.joblib.pkl', compress=9)

In [57]:
# Train the home_1 model: X1 is the encoded frame without home_1, Y1 the raw home_1 values.
clf1.fit(X1, Y1)

In [67]:
# Save the home_1 model; assigning to `_` keeps the cell from displaying joblib.dump's return value.
# Assumes the models/ directory already exists.
_ = joblib.dump(clf1, 'models/clf1.joblib.pkl', compress=9)

In [58]:
# Train the home_2 model: X2 is the encoded frame without home_2, Y2 the raw home_2 values.
clf2.fit(X2, Y2)

In [68]:
# Save the home_2 model; assigning to `_` keeps the cell from displaying joblib.dump's return value.
# Assumes the models/ directory already exists.
_ = joblib.dump(clf2, 'models/clf2.joblib.pkl', compress=9)

In [59]:
# Train the home_3 model: X3 is the encoded frame without home_3, Y3 the raw home_3 values.
clf3.fit(X3, Y3)

In [69]:
# Save the home_3 model; assigning to `_` keeps the cell from displaying joblib.dump's return value.
# Assumes the models/ directory already exists.
_ = joblib.dump(clf3, 'models/clf3.joblib.pkl', compress=9)

In [60]:
# Train the home_4 model: X4 is the encoded frame without home_4, Y4 the raw home_4 values.
clf4.fit(X4, Y4)

In [70]:
# Save the home_4 model; assigning to `_` keeps the cell from displaying joblib.dump's return value.
# Assumes the models/ directory already exists.
_ = joblib.dump(clf4, 'models/clf4.joblib.pkl', compress=9)

### Evaluate with Classifiers

In [6]:
def handle_unknown_value(encoder, value):
    """Return the integer code for ``value``, registering it on ``encoder`` if needed.

    If ``value`` is not yet among ``encoder.classes_`` it is appended as a new
    class (mutating the encoder). A value that is already present is NOT
    appended again, so repeated calls keep returning the same code instead of
    growing ``classes_`` with duplicates.

    Parameters:
        encoder: object exposing ``classes_`` and ``transform`` (e.g. a fitted
            LabelEncoder).
        value: the label to encode.

    Returns:
        The code ``encoder.transform`` assigns to ``value``.
    """
    known = list(encoder.classes_)
    if value not in known:
        encoder.classes_ = np.array(known + [value])
    return encoder.transform([value])[0]

In [8]:
# Import test data
test_data = pd.read_csv("NBA_test.csv")
test_X = pd.DataFrame()

# Apply label encoding to columns, using the encoders stored in `encoders`
for column in x_columns:
    if column == 'outcome':
        # Not read from the file; filled in with a constant below.
        continue
    if column == 'starting_min' or column == 'season':
        # These columns are copied through without encoding.
        test_X[column] = test_data[column]
        continue
    # Values the encoder does not know yet are registered on it on the fly.
    test_X[column] = [
        encoders[column].transform([x])[0] if x in encoders[column].classes_
        else handle_unknown_value(encoders[column], x)
        for x in test_data[column]
    ]

# Add "outcome" column with value 1 (assume winning value).
# Use the integer 1 rather than the string '1': the training frame copies
# `outcome` straight from the data without encoding it (presumably numeric —
# confirm), and a string here turns every test row into an object-dtype Series.
test_X['outcome'] = 1

In [9]:
# Print the integer code of the '?' placeholder for each home lineup column, one per line.
print(
    *(
        encoders[slot].transform(['?'])[0]
        for slot in ('home_0', 'home_1', 'home_2', 'home_3', 'home_4')
    ),
    sep="\n",
)
# Then show the encoded test frame.
print(test_X)

650
775
796
779
660
     home_team  away_team  season  starting_min  home_0  home_1  home_2  \
0           12          1    2007            18     215     194     451   
1           11          7    2007            16     102     775     442   
2           30         28    2007            39      89     775      48   
3           18          1    2007            21     650     527     685   
4           15         14    2007            19     174     336     599   
..         ...        ...     ...           ...     ...     ...     ...   
995          2          5    2016            20     650     452     778   
996         32          5    2016            21     241     211     796   
997          7          9    2016            13     234     775     180   
998         22         25    2016            24      61     285     796   
999          2         23    2016            20     123     246     387   

     home_3  home_4  away_0  away_1  away_2  away_3  away_4 outcome  
0       5

In [10]:
#Import models
clf0 = joblib.load('models/clf0.joblib.pkl')
clf1 = joblib.load('models/clf1.joblib.pkl')
clf2 = joblib.load('models/clf2.joblib.pkl')
clf3 = joblib.load('models/clf3.joblib.pkl')
clf4 = joblib.load('models/clf4.joblib.pkl')

In [70]:
# Per-player table used by decide_class, which reads its "score" and "total_games" columns by row label.
# NOTE(review): no index_col is passed, so the frame normally gets a default integer index, while
# decide_class looks rows up by player name — confirm those lookups actually match.
player_data = pd.read_csv('player_data.csv')

In [91]:
# Integer code of the '?' placeholder in each home lineup column's encoder.
placeholder0, placeholder1, placeholder2, placeholder3, placeholder4 = (
    encoders[slot].transform(['?'])[0]
    for slot in ('home_0', 'home_1', 'home_2', 'home_3', 'home_4')
)

def decide_class(predictions : np.ndarray, missing_feature) -> str:
    """Pick the best label for a missing lineup column.

    Each candidate is scored as ``0.7 * probability`` plus, when the candidate
    has a row in ``player_data``, ``0.3 * score / total_games``. The label with
    the highest combined score is returned.

    Parameters:
        predictions: ``predict_proba`` output for one row; only
            ``predictions[0]`` is used, aligned with ``classes[missing_feature]``.
        missing_feature: name of the column being predicted (key into the
            module-level ``classes`` dict).

    Returns:
        The winning label (not a Series).
    """
    probabilities = pd.Series(data=predictions[0], index=classes[missing_feature])
    scores = {}
    for player, probability in probabilities.items():
        score = 0.7 * probability
        try:
            games = player_data.at[player, "total_games"]
            # Skip the per-game bonus when there are no games: dividing by
            # zero would yield inf/NaN (or raise) and distort the ranking.
            if games > 0:
                score += 0.3 * player_data.at[player, "score"] / games
        except KeyError:
            # No stats row for this label: rank on probability alone.
            # NOTE(review): lookups are by label, so they only match if
            # player_data is indexed by the same labels — confirm.
            pass
        scores[player] = score
    return pd.Series(scores).idxmax() # return max

def evaluate(row : pd.Series) -> str:
    """Predict the label for the first home lineup column holding its placeholder code.

    Columns home_0 .. home_4 are checked in order. For the first one whose
    value equals that column's placeholder, the matching classifier is given
    the row without that column and its probabilities go to ``decide_class``.

    Parameters:
        row: one encoded test row.

    Returns:
        The label chosen by ``decide_class`` (not a Series).

    Raises:
        ValueError: if no home lineup column holds its placeholder code.
    """
    if row['home_0'] == placeholder0:
        return decide_class(clf0.predict_proba([row.drop(labels='home_0')]), 'home_0')
    elif row['home_1'] == placeholder1:
        return decide_class(clf1.predict_proba([row.drop(labels='home_1')]), 'home_1')
    elif row['home_2'] == placeholder2:
        return decide_class(clf2.predict_proba([row.drop(labels='home_2')]), 'home_2')
    elif row['home_3'] == placeholder3:
        return decide_class(clf3.predict_proba([row.drop(labels='home_3')]), 'home_3')
    elif row['home_4'] == placeholder4:
        return decide_class(clf4.predict_proba([row.drop(labels='home_4')]), 'home_4')
    else:
        # Show the offending row before failing.
        print(row)
        # ValueError is a subclass of Exception, so `except Exception` still catches it.
        raise ValueError("Invalid row provided.")

In [92]:
# Call evaluate on every row of test_X (axis=1 passes each row as a Series); yields one result per row.
predictions = test_X.apply(evaluate, axis=1)



In [93]:
# Print the per-row results of evaluate.
print(predictions)

0            Roy Hibbert
1             Chris Paul
2        Darren Collison
3             Chris Bosh
4          Zach Randolph
             ...        
995         Courtney Lee
996    Jonas Valanciunas
997         Devin Harris
998      Harrison Barnes
999         Vince Carter
Length: 1000, dtype: object
