# NBA Predictive Machine Learning Model
## Jordan Stapinski (jstapins), Calvin Lui (clui)
### Project II - Practical Data Science 67-364 Spring 2018

### Conventions on Representations
For the process of numerically encoding our statistics to be used in a machine learning model, we use the following numerical conventions (more concretely defined in cell one of code)

#### Conferences
| Conference | Numerical Representation |
|------------|--------------------------|
| Eastern    | 0                        |
| Western    | 1                        |
#### Divisions
| Division     | Numerical Representation |
|--------------|--------------------------|
| Atlantic     | 0                        |
| Southeast    | 1                        |
| Central      | 2                        |
| Northwest    | 3                        |
| Southwest    | 4                        |
| Pacific      | 5                        |
#### Locations
| Location | Numerical Representation |
|----------|--------------------------|
| Away     | 0                        |
| Home     | 1                        |
#### Results
| Result   | Numerical Representation |
|----------|--------------------------|
| Loss     | 0                        |
| Win      | 1                        |
#### Season Types
| Season Type    | Numerical Representation |
|----------------|--------------------------|
| Preseason      | 0                        |
| Regular Season | 1                        |
| Postseason     | 2                        |
#### Roles
| Role     | Numerical Representation |
|----------|--------------------------|
| Bench    | 0                        |
| Starter  | 1                        |
#### Positions
| Position       | Numerical Representation |
|----------------|--------------------------|
| Point Guard    | 0                        |
| Guard          | 1                        |
| Shooting Guard | 2                        |
| Small Forward  | 3                        |
| Forward        | 4                        |
| Power Forward  | 5                        |
| Center         | 6                        |

In [2]:
# Data Importing
import matplotlib.pyplot as plt
import numpy as np
import operator
import pandas as pd

%matplotlib inline

# Defining Constants
# Paths to the 2017-18 box-score CSV exports and the number of rows shown in previews
PLAYER_CSV_FILE = './nba-enhanced-stats/2017-18_playerBoxScore.csv'
TEAM_CSV_FILE = './nba-enhanced-stats/2017-18_teamBoxScore.csv'
PREVIEW_LEN = 10

# Manual Categorical Mappings for Logical Encoding
# Each label's numeric code is simply its position in the tuple below.
teamConf_numeric = {label: code for code, label in enumerate(("East", "West"))}
teamDiv_numeric = {
    label: code
    for code, label in enumerate(
        ("Atlantic", "Southeast", "Central", "Northwest", "Southwest", "Pacific")
    )
}
teamLoc_numeric = {label: code for code, label in enumerate(("Away", "Home"))}
teamRslt_numeric = {label: code for code, label in enumerate(("Loss", "Win"))}
seasTyp_numeric = {label: code for code, label in enumerate(("Pre", "Regular", "Post"))}
playStat_numeric = {label: code for code, label in enumerate(("Bench", "Starter"))}
playPos_numeric = {
    label: code
    for code, label in enumerate(("PG", "G", "SG", "SF", "F", "PF", "C"))
}
# Team Name and Player Name will be One-Hot Encoded

# Possible Columns to Remove for Avoiding Multicollinearity
derived_team_player_cols = [
    'teamTREB%', 'teamASST%', "teamTS%", "teamEFG%",
    "teamOREB%", "teamDREB%", "teamTO%", "teamSTL%",
    "teamBLK%", "teamBLKR", "teamPPS", "teamFIC",
    "teamFIC40", "teamOrtg", "teamDrtg", "teamEDiff",
    "teamPlay%", "teamAR", "teamAST/TO", "teamSTL/TO",
]
# The opponent-side names are the team-side names with "team" swapped for "oppt"
derived_oppt_player_cols = [
    team_col.replace("team", "oppt") for team_col in derived_team_player_cols
]

# Referee Names to be Removed
useless_player_cols = [
    'offLNm1', 'offFNm1',
    'offLNm2', 'offFNm2',
    'offLNm3', 'offFNm3',
]

# Column names for the player box-score file, in file order
player_cols = [
    # game context
    'gmDate', 'gmTime', 'seasTyp',
    # player name and team context
    'playLNm', 'playFNm',
    'teamAbbr', 'teamConf', 'teamDiv', 'teamLoc', 'teamRslt', 'teamDayOff',
    # referee names
    'offLNm1', 'offFNm1', 'offLNm2', 'offFNm2', 'offLNm3', 'offFNm3',
    # player profile
    'playDispNm', 'playStat', 'playMin', 'playPos',
    'playHeight', 'playWeight', 'playBDate',
    # player box-score line
    'playPTS', 'playAST', 'playTO', 'playSTL', 'playBLK', 'playPF',
    'playFGA', 'playFGM', 'playFG%',
    'play2PA', 'play2PM', 'play2P%',
    'play3PA', 'play3PM', 'play3P%',
    'playFTA', 'playFTM', 'playFT%',
    'playORB', 'playDRB', 'playTRB',
    # opponent context
    'opptAbbr', 'opptConf', 'opptDiv', 'opptLoc', 'opptRslt', 'opptDayOff',
]

# Column names for the team box-score file, in file order
team_cols = [
    # game context and referee names
    'gmDate', 'gmTime', 'seasTyp',
    'offLNm1', 'offFNm1', 'offLNm2', 'offFNm2', 'offLNm3', 'offFNm3',
    # team side
    'teamAbbr', 'teamConf', 'teamDiv', 'teamLoc', 'teamRslt', 'teamMin', 'teamDayOff',
    'teamPTS', 'teamAST', 'teamTO', 'teamSTL', 'teamBLK', 'teamPF',
    'teamFGA', 'teamFGM', 'teamFG%',
    'team2PA', 'team2PM', 'team2P%',
    'team3PA', 'team3PM', 'team3P%',
    'teamFTA', 'teamFTM', 'teamFT%',
    'teamORB', 'teamDRB', 'teamTRB',
    'teamPTS1', 'teamPTS2', 'teamPTS3', 'teamPTS4',
    'teamPTS5', 'teamPTS6', 'teamPTS7', 'teamPTS8',
    'teamTREB%', 'teamASST%', 'teamTS%', 'teamEFG%',
    'teamOREB%', 'teamDREB%', 'teamTO%', 'teamSTL%',
    'teamBLK%', 'teamBLKR', 'teamPPS', 'teamFIC',
    'teamFIC40', 'teamOrtg', 'teamDrtg', 'teamEDiff',
    'teamPlay%', 'teamAR', 'teamAST/TO', 'teamSTL/TO',
    # opponent side
    'opptAbbr', 'opptConf', 'opptDiv', 'opptLoc', 'opptRslt', 'opptMin', 'opptDayOff',
    'opptPTS', 'opptAST', 'opptTO', 'opptSTL', 'opptBLK', 'opptPF',
    'opptFGA', 'opptFGM', 'opptFG%',
    'oppt2PA', 'oppt2PM', 'oppt2P%',
    'oppt3PA', 'oppt3PM', 'oppt3P%',
    'opptFTA', 'opptFTM', 'opptFT%',
    'opptORB', 'opptDRB', 'opptTRB',
    'opptPTS1', 'opptPTS2', 'opptPTS3', 'opptPTS4',
    'opptPTS5', 'opptPTS6', 'opptPTS7', 'opptPTS8',
    'opptTREB%', 'opptASST%', 'opptTS%', 'opptEFG%',
    'opptOREB%', 'opptDREB%', 'opptTO%', 'opptSTL%',
    'opptBLK%', 'opptBLKR', 'opptPPS', 'opptFIC',
    'opptFIC40', 'opptOrtg', 'opptDrtg', 'opptEDiff',
    'opptPlay%', 'opptAR', 'opptAST/TO', 'opptSTL/TO',
    # game-level totals
    'poss', 'pace',
]

# Both files are read the same way: the first line of the file is skipped and
# the explicit column lists above are used as the column names.
csv_read_options = dict(sep=',', encoding='latin-1', skiprows=[0])
player_stats = pd.read_csv(PLAYER_CSV_FILE, names=player_cols, **csv_read_options)
team_stats = pd.read_csv(TEAM_CSV_FILE, names=team_cols, **csv_read_options)

### Data Cleaning

We clean the data according to the conventions above, and also strip the dashes from dates and the colons from times.

Furthermore, we get the stats for each team quickly before training the model.

In [3]:
# Data Cleaning for team_stats

# Date and Time Formatting
# Strip the "-" separators from dates and the ":" separators from times.
for raw_col, separator in (('gmDate', "-"), ('gmTime', ":")):
    team_stats[raw_col] = team_stats[raw_col].apply(
        lambda text: text.replace(separator, "")
    )

# Categorical Encoding
# Each column is replaced by the numeric code of its label; an unknown label
# raises KeyError from the dictionary lookup.
team_category_codes = (
    ('teamConf', teamConf_numeric),
    ('opptConf', teamConf_numeric),
    ('teamDiv', teamDiv_numeric),
    ('opptDiv', teamDiv_numeric),
    ('teamLoc', teamLoc_numeric),
    ('opptLoc', teamLoc_numeric),
    ('teamRslt', teamRslt_numeric),
    ('opptRslt', teamRslt_numeric),
    ('seasTyp', seasTyp_numeric),
)
for encoded_col, code_of in team_category_codes:
    team_stats[encoded_col] = team_stats[encoded_col].apply(
        lambda label: code_of[label]
    )

# One-Hot Encoding
# Prefix the abbreviations first so the team-side and opponent-side indicator
# columns get distinct names.
team_stats['teamAbbr'] = team_stats['teamAbbr'].apply(lambda abbr: "team_" + abbr)
one_hot_team = pd.get_dummies(team_stats['teamAbbr'])
team_stats = team_stats.join(one_hot_team)

team_stats['opptAbbr'] = team_stats['opptAbbr'].apply(lambda abbr: "oppt_" + abbr)
one_hot_oppt_team = pd.get_dummies(team_stats['opptAbbr'])
team_stats = team_stats.join(one_hot_oppt_team)

# Drop the already-encoded abbreviation columns, then the useless (referee)
# columns, and finally remove duplicate rows by keeping every other row.
team_stats = (
    team_stats
    .drop(columns=['teamAbbr', 'opptAbbr'])
    .drop(columns=useless_player_cols)
    .iloc[::2]
)

In [4]:
# Preview the first PREVIEW_LEN rows of the cleaned team table.
# Note: the raw team_stats had two rows for the same referee crew (presumably
# one per team in each game - TODO confirm); the cleaning cell keeps every
# other row, which is why the index shown advances in steps of two.
team_stats.head(PREVIEW_LEN)

Unnamed: 0,gmDate,gmTime,seasTyp,teamConf,teamDiv,teamLoc,teamRslt,teamMin,teamDayOff,teamPTS,...,oppt_OKC,oppt_ORL,oppt_PHI,oppt_PHO,oppt_POR,oppt_SA,oppt_SAC,oppt_TOR,oppt_UTA,oppt_WAS
0,20171017,800,1,0,0,0,0,241,0,99,...,0,0,0,0,0,0,0,0,0,0
2,20171017,1030,1,1,4,0,1,239,0,122,...,0,0,0,0,0,0,0,0,0,0
4,20171018,700,1,0,1,0,0,238,0,90,...,0,0,0,0,0,0,0,0,0,0
6,20171018,700,1,0,0,0,0,241,0,131,...,0,0,0,0,0,0,0,0,0,0
8,20171018,700,1,0,1,0,0,240,0,109,...,0,1,0,0,0,0,0,0,0,0
10,20171018,700,1,0,0,0,0,240,0,115,...,0,0,0,0,0,0,0,0,0,1
12,20171018,730,1,0,2,0,1,238,0,108,...,0,0,0,0,0,0,0,0,0,0
14,20171018,800,1,1,4,0,0,240,0,91,...,0,0,0,0,0,0,0,0,0,0
16,20171018,830,1,0,1,0,1,241,0,117,...,0,0,0,0,0,0,0,0,0,0
18,20171018,900,1,1,3,0,0,242,0,96,...,0,0,0,0,0,0,0,0,1,0


### Additional Data Cleaning
More data cleaning, for dates, times, etc.

In [5]:
# Data Cleaning for player_stats

# Date and Time Formatting
# Strip the "-" separators from dates and the ":" separators from times.
player_stats['gmDate'] = player_stats['gmDate'].apply(lambda x: x.replace("-", ""))
player_stats['gmTime'] = player_stats['gmTime'].apply(lambda x: x.replace(":", ""))
# Fix: clean the birth date from its own column. It was previously assigned
# from 'gmDate', which replaced every birth date with the game date.
# NOTE(review): assumes playBDate is a dash-separated string like gmDate - confirm.
player_stats['playBDate'] = player_stats['playBDate'].apply(lambda x: x.replace("-", ""))

# Categorical Encoding
# Each column is replaced by the numeric code of its label; an unknown label
# raises KeyError from the dictionary lookup.
player_category_codes = (
    ('teamConf', teamConf_numeric),
    ('opptConf', teamConf_numeric),
    ('teamDiv', teamDiv_numeric),
    ('opptDiv', teamDiv_numeric),
    ('teamLoc', teamLoc_numeric),
    ('opptLoc', teamLoc_numeric),
    ('teamRslt', teamRslt_numeric),
    ('opptRslt', teamRslt_numeric),
    ('seasTyp', seasTyp_numeric),
    ('playStat', playStat_numeric),
    ('playPos', playPos_numeric),
)
for encoded_col, code_of in player_category_codes:
    player_stats[encoded_col] = player_stats[encoded_col].apply(
        lambda label: code_of[label]
    )

# One-Hot Encoding
# Prefix the abbreviations first so the team-side and opponent-side indicator
# columns get distinct names.
player_stats['teamAbbr'] = player_stats['teamAbbr'].apply(lambda x: "team_" + x)
one_hot_team = pd.get_dummies(player_stats['teamAbbr'])
player_stats = player_stats.join(one_hot_team)

player_stats['opptAbbr'] = player_stats['opptAbbr'].apply(lambda x: "oppt_" + x)
one_hot_oppt_team = pd.get_dummies(player_stats['opptAbbr'])
player_stats = player_stats.join(one_hot_oppt_team)

# One indicator column per player display name; these are joined last, so
# they end up as the trailing columns of player_stats.
one_hot_player = pd.get_dummies(player_stats['playDispNm'])
player_stats = player_stats.join(one_hot_player)

# Drop Processed Columns
player_stats = player_stats.drop(['teamAbbr', 'opptAbbr', 'playDispNm', 'playLNm', 'playFNm'], axis=1)

# Drop Useless Columns
player_stats = player_stats.drop(useless_player_cols, axis=1)

In [11]:
# Preview the first PREVIEW_LEN rows of the cleaned player table
player_stats.head(PREVIEW_LEN)

['gmDate',
 'gmTime',
 'seasTyp',
 'teamConf',
 'teamDiv',
 'teamLoc',
 'teamRslt',
 'teamDayOff',
 'playStat',
 'playMin',
 'playPos',
 'playHeight',
 'playWeight',
 'playBDate',
 'playPTS',
 'playAST',
 'playTO',
 'playSTL',
 'playBLK',
 'playPF',
 'playFGA',
 'playFGM',
 'playFG%',
 'play2PA',
 'play2PM',
 'play2P%',
 'play3PA',
 'play3PM',
 'play3P%',
 'playFTA',
 'playFTM',
 'playFT%',
 'playORB',
 'playDRB',
 'playTRB',
 'opptConf',
 'opptDiv',
 'opptLoc',
 'opptRslt',
 'opptDayOff',
 'team_ATL',
 'team_BKN',
 'team_BOS',
 'team_CHA',
 'team_CHI',
 'team_CLE',
 'team_DAL',
 'team_DEN',
 'team_DET',
 'team_GS',
 'team_HOU',
 'team_IND',
 'team_LAC',
 'team_LAL',
 'team_MEM',
 'team_MIA',
 'team_MIL',
 'team_MIN',
 'team_NO',
 'team_NY',
 'team_OKC',
 'team_ORL',
 'team_PHI',
 'team_PHO',
 'team_POR',
 'team_SA',
 'team_SAC',
 'team_TOR',
 'team_UTA',
 'team_WAS',
 'oppt_ATL',
 'oppt_BKN',
 'oppt_BOS',
 'oppt_CHA',
 'oppt_CHI',
 'oppt_CLE',
 'oppt_DAL',
 'oppt_DEN',
 'oppt_DET',
 '

## Baseline Predictive Model
Using a Gradient Boosting Classifier, we achieve a baseline of about 66% accuracy on game outcomes by using the teams playing as our only features. We really wish to bump this metric up as much as possible by better dissecting team and player statistics and then composing hypothetical scenarios not tested in the season.

Note that this model doesn't really use any team or player statistics. Later, we will represent the team as a composition of individual players, which should help improve our ability to predict games.

In [7]:
# Predicting Wins only based on Teams Playing
# Can Build Another Model to Predict Points per Team
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Drop every original team_cols column that is still in the frame (the referee
# and abbreviation columns were removed during cleaning), so that only the
# one-hot team / opponent indicators remain as features.
already_removed = set(useless_player_cols) | {'teamAbbr', 'opptAbbr'}
raw_stat_cols = [col for col in team_cols if col not in already_removed]
X = team_stats.drop(columns=raw_stat_cols)
y = team_stats.teamRslt

# Hold out 10% of the games for evaluation, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=1
)

model = GradientBoostingClassifier()
model.fit(X_train, y_train)

# Accuracy on the held-out games
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.6585365853658537

## Representing the Player
For representing a single player, we are going to leverage the object-oriented functionality of Python and construct a `Player` class. A player is represented with the following attributes:
- `name` (the player's name)
- `ppg` (points per game)
- `apg` (assists per game)
- `rpg` (rebounds per game)
- `spg` (steals per game)
- `bpg` (blocks per game)
- `fgapg` (field goals attempted per game)
- `fgp` (field goal percentage)
- `ftapg` (free throws attempted per game)
- `ftp` (free throw percentage)
- `gp` (games played)
- `raw_player_df` (dataframe for just this player)

and methods:
- `to_training_data`: returns a vectorized set of features for this player
- `stats_vs_specific_team(team_abbrev)`: returns the player's vectorized stats against a specific team (not yet implemented)

In [19]:
class Player(object):
    """Season summary of a single player.

    Attributes:
        name: the player's name.
        ppg, apg, rpg, spg, bpg: points, assists, rebounds, steals and blocks
            per game.
        fgapg, fgp: field goals attempted per game and field goal percentage.
        ftapg, ftp: free throws attempted per game and free throw percentage.
        gp: games played.
        raw_player_df: dataframe holding just this player's rows.
    """

    def __init__(self, name, ppg, apg, rpg, spg, bpg, fgapg, fgp, ftapg, ftp, gp, raw_player_df):
        self.name = name
        self.ppg = ppg
        self.apg = apg
        self.rpg = rpg
        self.spg = spg
        self.bpg = bpg
        self.fgapg = fgapg
        self.fgp = fgp
        self.ftapg = ftapg
        self.ftp = ftp
        # TODO: debating use of a `tpp` metric (not stored yet)
        self.gp = gp
        # TODO: debating use of an `mpg` metric (not stored yet)
        self.raw_player_df = raw_player_df

    def stats_vs_specific_team(self, team_abbrev):
        """Placeholder for this player's stats against one team.

        Not implemented yet: currently returns None. The intent is to return
        a dictionary of funneled down values.
        """
        return None

    def to_training_data(self):
        """Return this player's features as a flat list.

        The order is [ppg, apg, rpg, spg, bpg, fgapg, fgp, ftapg, ftp].
        Games played (`gp`) is deliberately left out.
        """
        # Fix: this previously read the non-existent attribute `self.spgm`,
        # which raised AttributeError on every call.
        return [
            self.ppg,
            self.apg,
            self.rpg,
            self.spg,
            self.bpg,
            self.fgapg,
            self.fgp,
            self.ftapg,
            self.ftp,
        ]

# Get List of players
# The one-hot player-name columns were joined onto player_stats last, so they
# are its trailing columns. Derive how many columns come before them instead
# of hard-coding the count, so that adding or removing a feature column does
# not silently shift the slice.
num_non_player_cols = len(player_stats.columns) - len(one_hot_player.columns)
players_registered = player_stats.columns.tolist()[num_non_player_cols:]

# Helper functions to calculate statistics (per game, per season for certain ones)
def calculate_per_game(stat, player_games=None):
    """Scale a season total to a per-game rate.

    The number of "games" is the player's total minutes divided by 48, so the
    result is a rate per 48 minutes played rather than per appearance.

    Args:
        stat: season total of the statistic.
        player_games: dataframe of the player's rows; must have a 'playMin'
            column. When omitted, the module-level `specific_player` frame is
            used, which keeps the original one-argument call working.

    Returns:
        `stat` divided by (total minutes / 48).
    """
    if player_games is None:
        # Legacy one-argument call: fall back to the global the caller loop sets.
        player_games = specific_player
    mins_per_game = 48
    # NOTE(review): a player with zero total minutes makes this a division by
    # zero - confirm how callers want that case handled.
    total_games = player_games['playMin'].sum() / mins_per_game
    return stat / total_games


def _season_rate(specific_player, column):
    """Sum `column` over the player's rows and scale it with calculate_per_game."""
    return calculate_per_game(specific_player[column].sum(), specific_player)


# Points per game
def calculate_ppg(specific_player):
    return _season_rate(specific_player, 'playPTS')


# Assists per game
def calculate_apg(specific_player):
    return _season_rate(specific_player, 'playAST')


# Total rebounds per game
def calculate_trb(specific_player):
    return _season_rate(specific_player, 'playTRB')


# Steals per game
def calculate_spg(specific_player):
    return _season_rate(specific_player, 'playSTL')


# Blocks per game
def calculate_bpg(specific_player):
    return _season_rate(specific_player, 'playBLK')


# Field goal attempts per game
def calculate_fgapg(specific_player):
    return _season_rate(specific_player, 'playFGA')


# Field goal percentage: the plain mean of the per-row 'playFG%' values
def calculate_fgp(specific_player):
    return specific_player['playFG%'].mean()


# Free throw attempts per game
def calculate_ftapg(specific_player):
    return _season_rate(specific_player, 'playFTA')


# Free throw percentage: the plain mean of the per-row 'playFT%' values
def calculate_ftp(specific_player):
    return specific_player['playFT%'].mean()

# Build one Player summary per registered player name
all_stats = {}
for player in players_registered:
    # Rows in which this player's one-hot column is set. The frame is bound to
    # the module-level name `specific_player` on purpose: calculate_per_game
    # may read that global.
    specific_player = player_stats[player_stats[player] == 1]
    all_stats[player] = Player(
        name=player,
        ppg=calculate_ppg(specific_player),
        apg=calculate_apg(specific_player),
        rpg=calculate_trb(specific_player),
        spg=calculate_spg(specific_player),
        bpg=calculate_bpg(specific_player),
        fgapg=calculate_fgapg(specific_player),
        fgp=calculate_fgp(specific_player),
        ftapg=calculate_ftapg(specific_player),
        ftp=calculate_ftp(specific_player),
        gp=len(specific_player),  # one row per game played
        raw_player_df=specific_player,
    )

all_stats

{'Aaron Brooks': <__main__.Player at 0x1a0f926550>,
 'Aaron Gordon': <__main__.Player at 0x1a0f9267b8>,
 'Aaron Harrison': <__main__.Player at 0x1a0f926a90>,
 'Aaron Jackson': <__main__.Player at 0x1a0f926d68>,
 'Abdel Nader': <__main__.Player at 0x1a0f9350b8>,
 'Adreian Payne': <__main__.Player at 0x1a0f935358>,
 'Al Horford': <__main__.Player at 0x1a0f935630>,
 'Al Jefferson': <__main__.Player at 0x1a0f935908>,
 'Al-Farouq Aminu': <__main__.Player at 0x1a0f935be0>,
 'Alan Williams': <__main__.Player at 0x1a0f935eb8>,
 'Alec Burks': <__main__.Player at 0x1a0f9251d0>,
 'Alec Peters': <__main__.Player at 0x1a0f9254a8>,
 'Alex Caruso': <__main__.Player at 0x1a0f925780>,
 'Alex Len': <__main__.Player at 0x1a0f925a58>,
 'Alex Poythress': <__main__.Player at 0x1a0f925d30>,
 'Alfonzo McKinnie': <__main__.Player at 0x1a0f93f048>,
 'Allen Crabbe': <__main__.Player at 0x1a0f93f320>,
 'Amir Johnson': <__main__.Player at 0x1a0f93f5f8>,
 'Andre Drummond': <__main__.Player at 0x1a0f93f8d0>,
 'Andre

In [9]:
# Team Generating

"""
CONVENTION: A team is represented as a list of player objects
"""

'\nCONVENTION: A team is represented as a list of player objects\n'

In [None]:
# Model Params / Features Definition

In [None]:
# Model Training

In [None]:
# Scenarios