## Creating a Model to Predict NBA Finals Champion

In [None]:
import pandas as pd
pd.set_option('display.max_rows', 15)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import operator
%matplotlib inline

##### DataFrame with one row per team season, dating back to 1960. Columns contain basic data on award winners and basic game stats.

In [None]:
# Load the team-season table used to train and evaluate the models below.
data = pd.read_csv('./nba_data.csv')
# 'Unnamed: 0' is presumably a leftover index column from an earlier CSV export.
data.drop('Unnamed: 0', axis=1, inplace=True)
# Multiplying by 1 converts a boolean Champion column into 0/1 integers
# (assumes the CSV stores it as True/False -- TODO confirm).
data['Champion'] = data['Champion'] * 1
# Manually mark row 1312 as a champion. DataFrame.set_value was removed in
# pandas 1.0; .at is the supported label-based scalar setter.
data.at[1312, 'Champion'] = 1
# set_value returned the frame, so the notebook displayed it; keep that output.
data

##### Data on every player, going back only to 1978. Includes extensive per-player data: player efficiency ratings, age, weight, height, etc.

In [None]:
# Read the cleaned per-player table and discard the 'Unnamed: 0' column
# in a single chained expression.
player_data = pd.read_csv('./player_data_clean.csv').drop('Unnamed: 0', axis=1)
# Bare expression: rendered as the cell output when run in a notebook.
player_data

#### ELO Ratings for Teams by Game

In [None]:
# Read the per-game ELO table and keep only rows whose year_id is after 1958.
elo_data = pd.read_csv('./nbaallelo.csv').loc[lambda games: games['year_id'] > 1958]
# Bare expression: rendered as the cell output when run in a notebook.
elo_data

#### Scale the data to hopefully improve performance

In [None]:
# Every numeric feature column that gets standardised.
inputs = ['Wins', 'MVP', 'Scoring Leader', 'Rebound Leader',
          'Assist Leader', 'WS Leader', 'DPOY', 'MIP', '6MOY',
          'Coach of Year', 'All-Stars', 'All-Defensive', 'All-NBA',
          'FG%', '3P Attempts', '3P%', '2P Attempts', '2P%', 'FT%',
          'RPG', 'APG', 'STL', 'BLK', 'TOVPG', 'PPG', 'Accolades']

# Standardise each feature column of `data` in place (zero mean, unit variance).
# The previous version read from `data_train` / `data_test`, which are never
# defined in this notebook, and its second assignment overwrote the first.
# NOTE(review): scaling uses statistics from all years, including whichever
# year is later held out for testing -- confirm that is acceptable.
for inp in inputs:
    data[inp] = preprocessing.scale(data[inp])


#### Function takes the top three predictions and ranks them using each team's Win Shares, Player Efficiency Ratings, Age, and Weight.

In [None]:
def choose_winner(predictions, year, players=None):
    """Pick the best-rated team among the candidate predictions for a season.

    Each team's rating is computed from its player rows for ``year`` as
    ``sum(WS) * sum(PER) - sum(Age) - mean(Weight)``.

    Args:
        predictions: Iterable of team identifiers, ordered from most to least
            likely according to the upstream model.
        year: Season to look up in the player table.
        players: Optional DataFrame with ``Team``, ``Year``, ``WS``, ``PER``,
            ``Age`` and ``Weight`` columns. Defaults to the notebook-level
            ``player_data`` frame.

    Returns:
        A ``(rating, team)`` tuple for the chosen team. When no candidate has
        player rows for ``year`` the rating is NaN and the first candidate is
        returned.

    Raises:
        ValueError: If ``predictions`` is empty.
    """
    if players is None:
        players = player_data

    # Filter the season once instead of re-filtering the full table four
    # times per team.
    season = players[players['Year'] == year]

    ratings = []
    for prediction in predictions:
        roster = season[season['Team'] == prediction]
        rating = (roster['WS'].sum() * roster['PER'].sum()
                  - roster['Age'].sum()
                  - roster['Weight'].mean())
        ratings.append((rating, prediction))

    if not ratings:
        raise ValueError('choose_winner() needs at least one prediction')

    # A team with no player rows gets a NaN rating (mean of an empty column).
    # NaN never compares greater or smaller, so max() over raw tuples would
    # depend on input order; drop those ratings explicitly instead.
    rated = [pair for pair in ratings if not pd.isna(pair[0])]
    if not rated:
        # No usable player data for this season (the notebook notes player
        # data only goes back to 1978): defer to the first candidate.
        return ratings[0]

    # Keyed on the rating only, so exact ties go to the earlier candidate.
    return max(rated, key=lambda pair: pair[0])

#### Classification Model
This is a form of manual leave-one-year-out cross-validation: each year is held out as the test data in turn, while every other year is used for training. Because the probability of any single team winning a championship is very low, the model on its own predicts winners very rarely, so the three teams with the highest predicted probabilities are taken as the top three predictions. Player data was not used as training data for the classification model because we were not able to find a way to merge the dataframes without losing information.

In [None]:
# Feature subset used by the classifier in this cell (and by the dummy model).
inputs = ['Wins', 'MVP', 'DPOY', '6MOY', 'Coach of Year', 'Accolades', 'Scoring Leader',
          'All-NBA', 'RPG', 'TOVPG'
         ]

model = RandomForestClassifier(n_estimators = 50)
# Confusion-matrix counters accumulated over every held-out year.
TP = 0
TN = 0
FP = 0
FN = 0

# Leave-one-year-out evaluation: train on all other years, test on year i.
for i in range(1960,2018):
    train = data[data.Year != i]
    season = data[data.Year == i]

    model.fit(train[inputs], train['Champion'])
    # Column 1 of predict_proba is the probability of Champion == 1.
    predictions = model.predict_proba(season[inputs])[:, 1]

    # Indices of the three highest probabilities. The stable sort keeps the
    # earlier row on ties and always yields distinct rows; the previous
    # "set the winner to 0 and take max again" approach re-selected the same
    # team whenever the remaining probabilities were already 0.
    top_three = np.argsort(-predictions, kind='stable')[:3]
    candidates = tuple(season.iloc[top_three]['Team'])

    predicted = choose_winner(candidates, i)[1]

    actual = season[season.Champion == 1].iloc[0]['Team']
    if predicted == actual:
        TP += 1
        TN += len(predictions)-1
    else:
        # One wrong pick is both a false positive (the predicted team) and a
        # false negative (the missed champion).
        FP += 1
        FN += 1
        TN += len(predictions)-2

print('TP:',TP)
print('FP:',FP)
print('TN:',TN)
print('FN:',FN)
# Guard the ratios so a model with no correct picks reports 0 instead of
# raising ZeroDivisionError.
precision = TP/(TP+FP) if TP+FP else 0.0
recall = TP/(TP+FN) if TP+FN else 0.0
print('F1:', 1/((1/precision + 1/recall)/2) if TP else 0.0)

#### Dummy Model
It would be very bad to have performance anywhere near the dummy model

In [None]:
from sklearn.dummy import DummyClassifier
# Baseline classifier evaluated with the same leave-one-year-out loop.
model = DummyClassifier()
# Confusion-matrix counters accumulated over every held-out year.
TP = 0
TN = 0
FP = 0
FN = 0
for i in range(1960,2018):
    # Train on every other year. The previous version read from `data_train`,
    # which is never defined in this notebook.
    train = data[data.Year != i]
    season = data[data.Year == i]

    model.fit(train[inputs], train['Champion'])
    # Keep only the probability of Champion == 1 for each team.
    predictions = [m[1] for m in model.predict_proba(season[inputs])]
    # The first team with the highest probability is the pick.
    index, value = max(enumerate(predictions), key=operator.itemgetter(1))

    predicted = season.iloc[index]['Team']
    actual = season[season.Champion == 1].iloc[0]['Team']
    if predicted == actual:
        TP += 1
        TN += len(predictions)-1
    else:
        # One wrong pick is both a false positive and a false negative.
        FP += 1
        FN += 1
        TN += len(predictions)-2

print('TP:',TP)
print('FP:',FP)
print('TN:',TN)
print('FN:',FN)
# Guard the ratios: a baseline with no correct picks would otherwise raise
# ZeroDivisionError.
precision = TP/(TP+FP) if TP+FP else 0.0
recall = TP/(TP+FN) if TP+FN else 0.0
print('F1:', 1/((1/precision + 1/recall)/2) if TP else 0.0)



### Trying every combination of inputs
- Note: Takes a long time to run

In [None]:
# Candidate features; each subset is encoded as a bitmask over this list.
inputs = ['Wins','MVP','Scoring Leader','Rebound Leader','Assist Leader',
          'WS Leader','MIP','6MOY','Coach of Year','All-Stars',
          'All-Defensive','All-NBA','FG%','3P Attempts','3P%',
          '2P%', 'FT%', 'RPG', 'APG', 'STL', 'BLK', 'TOVPG', 'PPG', 'Accolades'
         ]

scores = []
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# Loop through combinations of input features. Only the last five bitmasks
# are evaluated here; widening the range covers more subsets at the cost of
# run time.
inpu = np.array(inputs)
n_inputs = len(inpu)
for combo_id in range(2**n_inputs-5, 2**n_inputs):
    # One bit per feature. The width must equal the number of features so the
    # boolean mask always lines up with `inpu` (a fixed width of 16 only
    # worked because these particular values need 24 bits).
    bits = format(combo_id, '0{}b'.format(n_inputs))
    mask = np.array([bit == '1' for bit in bits])
    combo = inpu[mask].tolist()

    # Reset the counters for every subset; otherwise each F1 would also
    # include the results of all previously evaluated subsets and cells.
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    # Leave-one-year-out evaluation for this feature subset.
    for year in range(1960,2018):
        # The previous version read from `data_train`, which is never defined.
        train = data[data.Year != year]
        season = data[data.Year == year]

        model.fit(train[combo], train['Champion'])
        # Keep only the probability of Champion == 1 for each team.
        predictions = [m[1] for m in model.predict_proba(season[combo])]
        index, value = max(enumerate(predictions), key=operator.itemgetter(1))

        predicted = season.iloc[index]['Team']

        actual = season[season.Champion == 1].iloc[0]['Team']
        if predicted == actual:
            TP += 1
            TN += len(predictions)-1
        else:
            # One wrong pick is both a false positive and a false negative.
            FP += 1
            FN += 1
            TN += len(predictions)-2

    # Guard the ratios so a subset with no correct picks scores 0 instead of
    # raising ZeroDivisionError.
    precision = TP/(TP+FP) if TP+FP else 0.0
    recall = TP/(TP+FN) if TP+FN else 0.0
    F1 = 1/((1/precision + 1/recall)/2) if TP else 0.0

    scores.append((F1, combo))
# Best (F1, feature list) pair; ties on F1 fall back to comparing the lists.
print(max(scores))

