# NBA Predictive Machine Learning Model
## Jordan Stapinski (jstapins), Calvin Lui (clui)
### Project II - Practical Data Science 67-364 Spring 2018

### Conventions on Representations
For the process of numerically encoding our statistics to be used in a machine learning model, we use the following numerical conventions (more concretely defined in cell one of code)

#### Conferences
| Conference | Numerical Representation |
|------------|--------------------------|
| Eastern    | 0                        |
| Western    | 1                        |
#### Divisions
| Division     | Numerical Representation |
|--------------|--------------------------|
| Atlantic     | 0                        |
| Southeastern | 1                        |
| Central      | 2                        |
| Northwestern | 3                        |
| Southwestern | 4                        |
| Pacific      | 5                        |
#### Locations
| Location | Numerical Representation |
|----------|--------------------------|
| Away     | 0                        |
| Home     | 1                        |
#### Results
| Result   | Numerical Representation |
|----------|--------------------------|
| Loss     | 0                        |
| Win      | 1                        |
#### Season Types
| Season Type    | Numerical Representation |
|----------------|--------------------------|
| Preseason      | 0                        |
| Regular Season | 1                        |
| Postseason     | 2                        |
#### Roles
| Role     | Numerical Representation |
|----------|--------------------------|
| Bench    | 0                        |
| Starter  | 1                        |
#### Positions
| Position       | Numerical Representation |
|----------------|--------------------------|
| Point Guard    | 0                        |
| Guard          | 1                        |
| Shooting Guard | 2                        |
| Small Forward  | 3                        |
| Forward        | 4                        |
| Power Forward  | 5                        |
| Center         | 6                        |

In [1]:
# code used to create TEAM_DOUBLED_CSV_FILE
# creates additional row to represent away team perspective
# necessary for group by operations

'''
def duplicate_row(row):
    new_row = row.to_frame().T
    for column in new_row.columns:
        if 'team' in column:
            oppt_value = new_row[column]
            team_value = new_row[column.replace('team', 'oppt')]
            new_row[column.replace('team', 'oppt')] = oppt_value
            new_row[column] = team_value
            
    # add home/away reversed row to team_stats
    team_stats.append(new_row)
    
    # keep orignal row unchanged
    return row

team_stats = team_stats.apply(duplicate_row, axis=1)
team_stats.head(10)
'''

"\ndef duplicate_row(row):\n    new_row = row.to_frame().T\n    for column in new_row.columns:\n        if 'team' in column:\n            oppt_value = new_row[column]\n            team_value = new_row[column.replace('team', 'oppt')]\n            new_row[column.replace('team', 'oppt')] = oppt_value\n            new_row[column] = team_value\n            \n    # add home/away reversed row to team_stats\n    team_stats.append(new_row)\n    \n    # keep orignal row unchanged\n    return row\n\nteam_stats = team_stats.apply(duplicate_row, axis=1)\nteam_stats.head(10)\n"

In [2]:
# Data Importing
import matplotlib.pyplot as plt
import numpy as np
import operator
import pandas as pd

%matplotlib inline

# Defining Constants
PLAYER_CSV_FILE = './nba-enhanced-stats/2017-18_playerBoxScore.csv'
TEAM_CSV_FILE = './nba-enhanced-stats/2017-18_teamBoxScore.csv'
TEAM_DOUBLED_CSV_FILE = './nba-enhanced-stats/2017-18_teamBoxScore_doubled.csv'
PREVIEW_LEN = 10

# Manual Categorical Mappings for Logical Encoding
teamConf_numeric = {"East": 0, "West": 1}
teamDiv_numeric = {"Atlantic": 0, "Southeast": 1, "Central": 2, "Northwest": 3, "Southwest": 4, "Pacific": 5}
teamLoc_numeric = {"Away": 0, "Home": 1}
teamRslt_numeric = {"Loss": 0, "Win": 1}
seasTyp_numeric = {"Pre": 0, "Regular": 1, "Post": 2}
playStat_numeric = {"Bench": 0, "Starter": 1}
playPos_numeric = {"PG": 0, "G": 1, "SG": 2, "SF": 3, "F": 4, "PF": 5, "C": 6}
# Team Name and Player Name will be One-Hot Encoded

# Possible Columns to Remove for Avoiding Multicollinearity
derived_team_player_cols = ['teamTREB%', 'teamASST%', "teamTS%", "teamEFG%", "teamOREB%", "teamDREB%", "teamTO%", "teamSTL%", "teamBLK%", "teamBLKR", "teamPPS", "teamFIC", "teamFIC40", "teamOrtg", "teamDrtg", "teamEDiff", "teamPlay%", "teamAR", "teamAST/TO", "teamSTL/TO"]
derived_oppt_player_cols = list(map(lambda x: x.replace("team", "oppt"), derived_team_player_cols))

# Referee Names to be Removed
useless_cols = ['offLNm1', 'offFNm1', 'offLNm2', 'offFNm2', 'offLNm3', 'offFNm3']

player_cols = ['gmDate', 'gmTime', 'seasTyp', 'playLNm', 'playFNm', 'teamAbbr', 'teamConf', 'teamDiv', 'teamLoc', 'teamRslt', 'teamDayOff', 'offLNm1', 'offFNm1', 'offLNm2', 'offFNm2', 'offLNm3', 'offFNm3', 'playDispNm', 'playStat', 'playMin', 'playPos', 'playHeight', 'playWeight', 'playBDate', 'playPTS', 'playAST', 'playTO', 'playSTL', 'playBLK', 'playPF', 'playFGA', 'playFGM', 'playFG%', 'play2PA', 'play2PM', 'play2P%', 'play3PA', 'play3PM', 'play3P%', 'playFTA', 'playFTM', 'playFT%', 'playORB', 'playDRB', 'playTRB', 'opptAbbr', 'opptConf', 'opptDiv', 'opptLoc', 'opptRslt', 'opptDayOff']
team_cols = ['gmDate', 'gmTime', 'seasTyp', 'offLNm1', 'offFNm1', 'offLNm2', 'offFNm2', 'offLNm3', 'offFNm3', 'teamAbbr', 'teamConf', 'teamDiv', 'teamLoc', 'teamRslt', 'teamMin', 'teamDayOff', 'teamPTS', 'teamAST', 'teamTO', 'teamSTL', 'teamBLK', 'teamPF', 'teamFGA', 'teamFGM', 'teamFG%', 'team2PA', 'team2PM', 'team2P%', 'team3PA', 'team3PM', 'team3P%', 'teamFTA', 'teamFTM', 'teamFT%', 'teamORB', 'teamDRB', 'teamTRB', 'teamPTS1', 'teamPTS2', 'teamPTS3', 'teamPTS4', 'teamPTS5', 'teamPTS6', 'teamPTS7', 'teamPTS8', 'teamTREB%', 'teamASST%', 'teamTS%', 'teamEFG%', 'teamOREB%', 'teamDREB%', 'teamTO%', 'teamSTL%', 'teamBLK%', 'teamBLKR', 'teamPPS', 'teamFIC', 'teamFIC40', 'teamOrtg', 'teamDrtg', 'teamEDiff', 'teamPlay%', 'teamAR', 'teamAST/TO', 'teamSTL/TO', 'opptAbbr', 'opptConf', 'opptDiv', 'opptLoc', 'opptRslt', 'opptMin', 'opptDayOff', 'opptPTS', 'opptAST', 'opptTO', 'opptSTL', 'opptBLK', 'opptPF', 'opptFGA', 'opptFGM', 'opptFG%', 'oppt2PA', 'oppt2PM', 'oppt2P%', 'oppt3PA', 'oppt3PM', 'oppt3P%', 'opptFTA', 'opptFTM', 'opptFT%', 'opptORB', 'opptDRB', 'opptTRB', 'opptPTS1', 'opptPTS2', 'opptPTS3', 'opptPTS4', 'opptPTS5', 'opptPTS6', 'opptPTS7', 'opptPTS8', 'opptTREB%', 'opptASST%', 'opptTS%', 'opptEFG%', 'opptOREB%', 'opptDREB%', 'opptTO%', 'opptSTL%', 'opptBLK%', 'opptBLKR', 'opptPPS', 'opptFIC', 'opptFIC40', 'opptOrtg', 'opptDrtg', 'opptEDiff', 'opptPlay%', 'opptAR', 'opptAST/TO', 'opptSTL/TO', 'poss', 'pace']

player_stats = pd.read_csv(PLAYER_CSV_FILE, sep=',', names=player_cols, encoding='latin-1', skiprows=[0])
team_stats = pd.read_csv(TEAM_DOUBLED_CSV_FILE, sep=',', names=team_cols, encoding='latin-1', skiprows=[0])

### Data Cleaning

Performing some data cleaning according to the above conventions, as well as removing dashes and colon in dates and times.

Furthermore, we get the stats for each team quickly before training the model.

In [3]:
# Data Cleaning for team_stats

# Date and Time Formatting
team_stats['gmDate'] = team_stats['gmDate'].apply(lambda x: x.replace("-", ""))
team_stats['gmTime'] = team_stats['gmTime'].apply(lambda x: x.replace(":", ""))

# Categorical Encoding
team_stats['teamConf'] = team_stats['teamConf'].apply(lambda x: teamConf_numeric[x])
team_stats['opptConf'] = team_stats['opptConf'].apply(lambda x: teamConf_numeric[x])
team_stats['teamDiv'] = team_stats['teamDiv'].apply(lambda x: teamDiv_numeric[x])
team_stats['opptDiv'] = team_stats['opptDiv'].apply(lambda x: teamDiv_numeric[x])
team_stats['teamLoc'] = team_stats['teamLoc'].apply(lambda x: teamLoc_numeric[x])
team_stats['opptLoc'] = team_stats['opptLoc'].apply(lambda x: teamLoc_numeric[x])
team_stats['teamRslt'] = team_stats['teamRslt'].apply(lambda x: teamRslt_numeric[x])
team_stats['opptRslt'] = team_stats['opptRslt'].apply(lambda x: teamRslt_numeric[x])
team_stats['seasTyp'] = team_stats['seasTyp'].apply(lambda x: seasTyp_numeric[x])

# Drop Useless Columns
team_stats = team_stats.drop(useless_cols, axis=1)

# Remove Duplicate Rows
# team_stats = team_stats.iloc[::2]

In [4]:
# Data Cleaning for player_stats

# Date and Time Formatting
player_stats['gmDate'] = player_stats['gmDate'].apply(lambda x: x.replace("-", ""))
player_stats['gmTime'] = player_stats['gmTime'].apply(lambda x: x.replace(":", ""))
player_stats['playBDate'] = player_stats['gmDate'].apply(lambda x: x.replace("-", ""))

# Categorical Encoding
player_stats['teamConf'] = player_stats['teamConf'].apply(lambda x: teamConf_numeric[x])
player_stats['opptConf'] = player_stats['opptConf'].apply(lambda x: teamConf_numeric[x])
player_stats['teamDiv'] = player_stats['teamDiv'].apply(lambda x: teamDiv_numeric[x])
player_stats['opptDiv'] = player_stats['opptDiv'].apply(lambda x: teamDiv_numeric[x])
player_stats['teamLoc'] = player_stats['teamLoc'].apply(lambda x: teamLoc_numeric[x])
player_stats['opptLoc'] = player_stats['opptLoc'].apply(lambda x: teamLoc_numeric[x])
player_stats['teamRslt'] = player_stats['teamRslt'].apply(lambda x: teamRslt_numeric[x])
player_stats['opptRslt'] = player_stats['opptRslt'].apply(lambda x: teamRslt_numeric[x])
player_stats['seasTyp'] = player_stats['seasTyp'].apply(lambda x: seasTyp_numeric[x])
player_stats['playStat'] = player_stats['playStat'].apply(lambda x: playStat_numeric[x])
player_stats['playPos'] = player_stats['playPos'].apply(lambda x: playPos_numeric[x])

# Drop Useless Columns
player_stats = player_stats.drop(useless_cols, axis=1)

### One-Hot Encoding
Algorithm-friendly data formatting for NBA Teams & Players.

In [5]:
# One-Hot Encoding for Players
one_hot_player = pd.get_dummies(player_stats['playDispNm'])
player_stats = player_stats.join(one_hot_player)

# Drop Processed Columns
player_stats = player_stats.drop(['playDispNm', 'playLNm', 'playFNm'], axis=1)

In [6]:
# Create Table for Use Later
# Team Representation based on Players

# join team_stats & player_stats
join_cols = ['gmDate', 'seasTyp', 'gmTime', 'teamAbbr', 'teamConf', 'teamDiv', 'teamLoc', 'teamRslt', 'teamDayOff', 'opptAbbr', 'opptConf', 'opptDiv', 'opptLoc', 'opptRslt', 'opptDayOff']
joined_stats = pd.merge(team_stats, player_stats, how='left', left_on=join_cols, right_on=join_cols, suffixes=('_t', '_p'))

func = {}
for column in joined_stats.columns:
    if column in team_cols:
        func[column] = 'first'
    elif column in player_cols:
        # drop player related statistics
        pass
    else:
        # check whether player is playing for home team
        func[column] = 'max'

# group by game + team
grp = joined_stats.groupby(['gmDate', 'teamAbbr']).agg(func)

grp.head(PREVIEW_LEN)

Unnamed: 0_level_0,Unnamed: 1_level_0,gmDate,gmTime,seasTyp,teamAbbr,teamConf,teamDiv,teamLoc,teamRslt,teamMin,teamDayOff,...,Xavier Munford,Xavier Rathan-Mayes,Xavier Silas,Yogi Ferrell,Zach Collins,Zach LaVine,Zach Randolph,Zaza Pachulia,Zhou Qi,Ãlex Abrines
gmDate,teamAbbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
20171017,BOS,20171017,800,1,BOS,0,0,0,0,241,0,...,0,0,0,0,0,0,0,0,0,0
20171017,CLE,20171017,800,1,CLE,0,2,1,1,240,0,...,0,0,0,0,0,0,0,0,0,0
20171017,GS,20171017,1030,1,GS,1,5,1,0,241,0,...,0,0,0,0,0,0,0,1,0,0
20171017,HOU,20171017,1030,1,HOU,1,4,0,1,239,0,...,0,0,0,0,0,0,0,0,0,0
20171018,ATL,20171018,830,1,ATL,0,1,0,1,241,0,...,0,0,0,0,0,0,0,0,0,0
20171018,BKN,20171018,700,1,BKN,0,0,0,0,241,0,...,0,0,0,0,0,0,0,0,0,0
20171018,BOS,20171018,730,1,BOS,0,0,1,0,240,0,...,0,0,0,0,0,0,0,0,0,0
20171018,CHA,20171018,700,1,CHA,0,1,0,0,238,0,...,0,0,0,0,0,0,0,0,0,0
20171018,DAL,20171018,830,1,DAL,1,4,1,0,240,0,...,0,0,0,1,0,0,0,0,0,0
20171018,DEN,20171018,900,1,DEN,1,3,0,0,242,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# One-Hot Encoding for Teams

# team_stats
team_stats['teamAbbr'] = team_stats['teamAbbr'].apply(lambda x: "team_" + x)
one_hot_team = pd.get_dummies(team_stats['teamAbbr'])
team_stats = team_stats.join(one_hot_team)

team_stats['opptAbbr'] = team_stats['opptAbbr'].apply(lambda x: "oppt_" + x)
one_hot_oppt_team = pd.get_dummies(team_stats['opptAbbr'])
team_stats = team_stats.join(one_hot_oppt_team)

# player_stats
player_stats['teamAbbr'] = player_stats['teamAbbr'].apply(lambda x: "team_" + x)
one_hot_team = pd.get_dummies(player_stats['teamAbbr'])
player_stats = player_stats.join(one_hot_team)

player_stats['opptAbbr'] = player_stats['opptAbbr'].apply(lambda x: "oppt_" + x)
one_hot_oppt_team = pd.get_dummies(player_stats['opptAbbr'])
player_stats = player_stats.join(one_hot_oppt_team)

# Drop Processed Columns
team_stats = team_stats.drop(['teamAbbr', 'opptAbbr'], axis=1)
player_stats = player_stats.drop(['teamAbbr', 'opptAbbr'], axis=1)

In [8]:
team_stats.head(PREVIEW_LEN)

Unnamed: 0,gmDate,gmTime,seasTyp,teamConf,teamDiv,teamLoc,teamRslt,teamMin,teamDayOff,teamPTS,...,oppt_OKC,oppt_ORL,oppt_PHI,oppt_PHO,oppt_POR,oppt_SA,oppt_SAC,oppt_TOR,oppt_UTA,oppt_WAS
0,20171017,800,1,0,0,0,0,241,0,99,...,0,0,0,0,0,0,0,0,0,0
1,20171017,800,1,0,2,1,1,240,0,102,...,0,0,0,0,0,0,0,0,0,0
2,20171017,1030,1,1,4,0,1,239,0,122,...,0,0,0,0,0,0,0,0,0,0
3,20171017,1030,1,1,5,1,0,241,0,121,...,0,0,0,0,0,0,0,0,0,0
4,20171018,700,1,0,1,0,0,238,0,90,...,0,0,0,0,0,0,0,0,0,0
5,20171018,700,1,0,2,1,1,239,0,102,...,0,0,0,0,0,0,0,0,0,0
6,20171018,700,1,0,0,0,0,241,0,131,...,0,0,0,0,0,0,0,0,0,0
7,20171018,700,1,0,2,1,1,240,0,140,...,0,0,0,0,0,0,0,0,0,0
8,20171018,700,1,0,1,0,0,240,0,109,...,0,1,0,0,0,0,0,0,0,0
9,20171018,700,1,0,1,1,1,240,0,116,...,0,0,0,0,0,0,0,0,0,0


In [9]:
player_stats.head(PREVIEW_LEN)

Unnamed: 0,gmDate,gmTime,seasTyp,teamConf,teamDiv,teamLoc,teamRslt,teamDayOff,playStat,playMin,...,oppt_OKC,oppt_ORL,oppt_PHI,oppt_PHO,oppt_POR,oppt_SA,oppt_SAC,oppt_TOR,oppt_UTA,oppt_WAS
0,20171017,800,1,0,0,0,0,0,1,40,...,0,0,0,0,0,0,0,0,0,0
1,20171017,800,1,0,0,0,0,0,1,39,...,0,0,0,0,0,0,0,0,0,0
2,20171017,800,1,0,0,0,0,0,1,37,...,0,0,0,0,0,0,0,0,0,0
3,20171017,800,1,0,0,0,0,0,1,32,...,0,0,0,0,0,0,0,0,0,0
4,20171017,800,1,0,0,0,0,0,1,5,...,0,0,0,0,0,0,0,0,0,0
5,20171017,800,1,0,0,0,0,0,0,35,...,0,0,0,0,0,0,0,0,0,0
6,20171017,800,1,0,0,0,0,0,0,20,...,0,0,0,0,0,0,0,0,0,0
7,20171017,800,1,0,0,0,0,0,0,19,...,0,0,0,0,0,0,0,0,0,0
8,20171017,800,1,0,0,0,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0
9,20171017,800,1,0,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0


## Baseline Predictive Model
Using a Gradient Boosting Classifier, we achieve a baseline of about 66% accuracy on game outcomes by using the teams playing as our only features. We really wish to bump this metric up as much as possible by better dissecting team and player statistics and then composing hypothetical scenarios not tested in the season.

Note this model doesn't really use any team or player statistics. Later, we will represent the team as a composition of individucal players, which should help in improving our ability to predict games.

In [10]:
# Predicting Wins only based on Teams Playing
# Can Build Another Model to Predict Points per Team
X = team_stats.drop(list(set(team_cols) - set(useless_cols) - set(['teamAbbr', 'opptAbbr'])), axis=1)
y = team_stats.teamRslt

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1)

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.63821138211382111

## Representing the Player
For representing a single player, we are going to leverage the object-oriented functionality of Python and construct a `Player` class. A player is represented with the following attributes:
- `name` (the player's name)
- `ppg` (points per game)
- `apg` (assists per game)
- `rpg` (rebounds per game)
- `spg` (steals per game)
- `bpg` (blocks per game)
- `fgapg` (field goals attempted per game)
- `fgp` (field goal percentage)
- `ftapg` (free throws attempted per game)
- `ftp` (free throw percentage)
- `gp` (games played)
- `raw_player_df` (dataframe for just this player)

and methods:
- `to_training_data`: returns a vectorized set of features for this player
- `stats_vs_team(team_abbr)`: returns players vectorized stats vs a specific team

In [149]:
class Player(object):
    def __init__(self, name, ppg, apg, rpg, spg, bpg, fgapg, fgp, ftapg, ftp, gp, raw_player_df):
        self.name = name
        self.ppg = ppg
        self.apg = apg
        self.rpg = rpg
        self.spg = spg
        self.bpg = bpg
        self.fgapg = fgapg
        self.fgp = fgp
        self.ftapg = ftapg
        self.ftp = ftp
# Debating use of below metric
#         self.tpp = tpp
        self.gp = gp
# Debating use of below metric
#         self.mpg = mpg
        self.raw_player_df = raw_player_df
        
    def stats_vs_team(self, team_abbr):
        specific_team = self.raw_player_df[self.raw_player_df['oppt_' + team_abbr] == 1]
        ppg = calculate_ppg(specific_team)
        apg = calculate_apg(specific_team)
        apg = calculate_apg(specific_team)
        trb = calculate_trb(specific_team)
        spg = calculate_spg(specific_team)
        bpg = calculate_bpg(specific_team)
        fgapg = calculate_fgapg(specific_team)
        fgp = calculate_fgp(specific_team)
        ftapg = calculate_ftapg(specific_team)
        ftp = calculate_ftp(specific_team)
        gp = len(specific_team)
        
        # return {'ppg':ppg, 'apg':apg, 'trb':trb, 'spg':spg, 'bpg':bpg, 'fgapg':fgapg, 'fgp':fgp, 'ftapg':ftapg, 'ftp':ftp, 'gp':gp}
        return [ppg, apg, trb/gp, spg, bpg, fgapg, fgp]
        
    def to_training_data(self):
        # Note: Leaving out games played
        # Removing , self.ftapg, self.ftp boosted 2%
        return [self.ppg, self.apg, self.rpg, self.spg, self.bpg, self.fgapg, self.fgp]

# Get List of players
num_non_player_cols = 41
players_registered = player_stats.columns.tolist()[num_non_player_cols:]

# Helper functions to calculate statistics (per game, per season for certain ones)
def calculate_per_game(stat, specific_player):
    total_games = len(specific_player)
    return stat / total_games   

def calculate_ppg(specific_player):
    total_points = specific_player['playPTS'].sum()
    return calculate_per_game(total_points, specific_player)

def calculate_apg(specific_player):
    total_assists = specific_player['playAST'].sum()
    return calculate_per_game(total_assists, specific_player)

def calculate_trb(specific_player):
    total_rebounds = specific_player['playTRB'].sum()
    return calculate_per_game(total_rebounds, specific_player)

def calculate_spg(specific_player):
    total_steals = specific_player['playSTL'].sum()
    return calculate_per_game(total_steals, specific_player)

def calculate_bpg(specific_player):
    total_blocks = specific_player['playBLK'].sum()
    return calculate_per_game(total_blocks, specific_player)

# Field goal attempts per game
def calculate_fgapg(specific_player):
    total_fga = specific_player['playFGA'].sum()
    return calculate_per_game(total_fga, specific_player)

# Field goals percentage made per game
def calculate_fgp(specific_player):
    return specific_player['playFG%'].mean()

# Free throw attempts per game
def calculate_ftapg(specific_player):
    total_fta = specific_player['playFTA'].sum()
    return calculate_per_game(total_fta, specific_player)

# Free throw percentage per game
def calculate_ftp(specific_player):
    return specific_player['playFT%'].mean()

all_stats = {}
for player in players_registered:
    specific_player = player_stats[player_stats[player] == 1]
    ppg = calculate_ppg(specific_player)
    apg = calculate_apg(specific_player)
    trb = calculate_trb(specific_player)
    spg = calculate_spg(specific_player)
    bpg = calculate_bpg(specific_player)
    fgapg = calculate_fgapg(specific_player)
    fgp = calculate_fgp(specific_player)
    ftapg = calculate_ftapg(specific_player)
    ftp = calculate_ftp(specific_player)
    gp = len(specific_player)
    name = player
    all_stats[player] = Player(name, ppg, apg, trb, spg, bpg, fgapg, fgp, ftapg, ftp, gp, specific_player)
    
# Sanity Check
print('Stephen Curry 2017-18 Season Statistics:')
print("Points per Game:", "%.1f" % all_stats['Stephen Curry'].ppg)
print("Assists per Game:", "%.1f" % all_stats['Stephen Curry'].apg)
print("Rebounds per Game:", "%.1f" % all_stats['Stephen Curry'].rpg)

Stephen Curry 2017-18 Season Statistics:
Points per Game: 26.4
Assists per Game: 6.1
Rebounds per Game: 5.1


![stephen_curry](images/curry_stats.png)

In [150]:
# Team Representation
grp.head(PREVIEW_LEN)

Unnamed: 0_level_0,Unnamed: 1_level_0,gmDate,gmTime,seasTyp,teamAbbr,teamConf,teamDiv,teamLoc,teamRslt,teamMin,teamDayOff,...,Xavier Munford,Xavier Rathan-Mayes,Xavier Silas,Yogi Ferrell,Zach Collins,Zach LaVine,Zach Randolph,Zaza Pachulia,Zhou Qi,Ãlex Abrines
gmDate,teamAbbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
20171017,BOS,20171017,800,1,BOS,0,0,0,0,241,0,...,0,0,0,0,0,0,0,0,0,0
20171017,CLE,20171017,800,1,CLE,0,2,1,1,240,0,...,0,0,0,0,0,0,0,0,0,0
20171017,GS,20171017,1030,1,GS,1,5,1,0,241,0,...,0,0,0,0,0,0,0,1,0,0
20171017,HOU,20171017,1030,1,HOU,1,4,0,1,239,0,...,0,0,0,0,0,0,0,0,0,0
20171018,ATL,20171018,830,1,ATL,0,1,0,1,241,0,...,0,0,0,0,0,0,0,0,0,0
20171018,BKN,20171018,700,1,BKN,0,0,0,0,241,0,...,0,0,0,0,0,0,0,0,0,0
20171018,BOS,20171018,730,1,BOS,0,0,1,0,240,0,...,0,0,0,0,0,0,0,0,0,0
20171018,CHA,20171018,700,1,CHA,0,1,0,0,238,0,...,0,0,0,0,0,0,0,0,0,0
20171018,DAL,20171018,830,1,DAL,1,4,1,0,240,0,...,0,0,0,1,0,0,0,0,0,0
20171018,DEN,20171018,900,1,DEN,1,3,0,0,242,0,...,0,0,0,0,0,0,0,0,0,0


### Get Roster From Game
Gets the designated players from the passed date and team abbreviation representing the specific game played

In [151]:
# Get the roster of the team from the team_stats table

def get_roster_from_game(gmDate, teamAbbr):
    def is_name(s):
        return (len(s.split()) > 1)
    final_roster = []
    row = grp[grp['gmDate'] == gmDate]
    row = row[row['teamAbbr'] == teamAbbr]
    for col in row:
        if (row[col][0] == 1) and (is_name(col)):
            final_roster += [col]
    return final_roster

get_roster_from_game('20171017', 'BOS')

['Al Horford',
 'Aron Baynes',
 'Gordon Hayward',
 'Jaylen Brown',
 'Jayson Tatum',
 'Kyrie Irving',
 'Marcus Smart',
 'Semi Ojeleye',
 'Shane Larkin',
 'Terry Rozier']

In [188]:
final = grp.drop(list(set(grp.columns) - set(['gmDate', 'teamAbbr'])), axis=1)
final = final.drop(final.columns[0:1], axis=1)

min_reg_players = 8
def get_rosters(abbr, oppt, date):    
    roster = get_roster_from_game(date, abbr)
        
    team_arr = []
    for player in roster:
        if player in all_stats:
            team_arr.append(all_stats[player].stats_vs_team(oppt))
    numerical_roster = team_arr

    while len(numerical_roster) < 8:
        # print(len(numerical_roster))
        numerical_roster.append([0, 0, 0, 0, 0, 0, 0])
        
    return numerical_roster[:min_reg_players]

In [189]:
team_stats.columns[115:145]

Index(['team_ATL', 'team_BKN', 'team_BOS', 'team_CHA', 'team_CHI', 'team_CLE',
       'team_DAL', 'team_DEN', 'team_DET', 'team_GS', 'team_HOU', 'team_IND',
       'team_LAC', 'team_LAL', 'team_MEM', 'team_MIA', 'team_MIL', 'team_MIN',
       'team_NO', 'team_NY', 'team_OKC', 'team_ORL', 'team_PHI', 'team_PHO',
       'team_POR', 'team_SA', 'team_SAC', 'team_TOR', 'team_UTA', 'team_WAS'],
      dtype='object')

In [291]:
# CAUTION: this cell takes a lot of time to run

# where columns of team_stats begin to be team dummy columns
teams_start = 115
teams_end = 145

# basis for train/test data
X = team_stats.copy(deep=True)
new_columns = None

for game in X.iterrows():
    gmDate = game[1]['gmDate']
    gmTime = game[1]['gmTime']
    
    # get teamAbbv from dummy columns
    teamAbbv = None
    team_names = game[1][teams_start:teams_end].index.tolist()
    for i in range(len(game[1][teams_start:teams_end])):
        team_name = team_names[i]
        team_playing = game[1][teams_start:teams_end][i]
        if team_playing == 1:
            teamAbbv = team_name.replace("team_", "")
            
    # get opptAbbv from dummy columns
    opptAbbv = None
    oppt_names = game[1][teams_end:].index.tolist()
    for i in range(len(game[1][teams_end:])):
        oppt_name = oppt_names[i]
        oppt_playing = game[1][teams_end:][i]
        if oppt_playing == 1:
            opptAbbv = oppt_name.replace("oppt_", "")
            
    # assuming pk is gmDate, gmTime, teamAbbv
    dateFiltered = grp[grp['gmDate'] == gmDate]
    timeFiltered = dateFiltered[dateFiltered['gmTime'] == gmTime]
    teamFiltered = timeFiltered[timeFiltered["teamAbbr"] == teamAbbv]
    team_data = teamFiltered
    
    numeric_stats = get_rosters(team_data['teamAbbr'][0], team_data['opptAbbr'][0], team_data['gmDate'][0])
    flattened_stats = np.array(numeric_stats).flatten()
    
    if new_columns is None:
        new_columns = flattened_stats
    else:
        # print(flattened_stats.shape)
        new_columns = np.vstack((new_columns, flattened_stats))

for i in range(new_columns.shape[1]):
    new_column = new_columns[:, i]
    X['playStat' + str(i)] = new_column

In [292]:
X.head(5)

Unnamed: 0,gmDate,gmTime,seasTyp,teamConf,teamDiv,teamLoc,teamRslt,teamMin,teamDayOff,teamPTS,...,playStat46,playStat47,playStat48,playStat49,playStat50,playStat51,playStat52,playStat53,playStat54,playStat55
0,20171017,800,1,0,0,0,0,241,0,99,...,1.0,15.0,0.37055,1.333333,0.333333,0.333333,0.666667,0.0,3.666667,0.066667
1,20171017,800,1,0,2,1,1,240,0,102,...,1.0,10.0,0.26765,6.0,1.0,0.555556,0.0,0.333333,4.333333,0.318167
2,20171017,1030,1,1,4,0,1,239,0,122,...,0.333333,6.666667,0.494467,5.5,2.5,2.25,2.0,0.0,8.5,0.22915
3,20171017,1030,1,1,5,1,0,241,0,121,...,0.0,8.666667,0.686767,2.0,0.0,1.0,0.0,0.0,0.0,0.0
4,20171018,700,1,0,1,0,0,238,0,90,...,0.333333,14.0,0.472,1.5,0.5,0.0,0.0,0.0,7.0,0.05555


In [293]:
X_slow = X

In [328]:
a = list(range(9, 58))
b = list(range(64, 175))
a.extend(b)
col_indices = a
# drop useless and cheating features
X_small = X_slow.drop(X_slow.columns[col_indices], axis=1)

count = 0
for column in X_small.columns:
    print(column, count)
    count += 1

gmDate 0
gmTime 1
seasTyp 2
teamConf 3
teamDiv 4
teamLoc 5
teamRslt 6
teamMin 7
teamDayOff 8
opptConf 9
opptDiv 10
opptLoc 11
opptRslt 12
opptMin 13
opptDayOff 14
playStat0 15
playStat1 16
playStat2 17
playStat3 18
playStat4 19
playStat5 20
playStat6 21
playStat7 22
playStat8 23
playStat9 24
playStat10 25
playStat11 26
playStat12 27
playStat13 28
playStat14 29
playStat15 30
playStat16 31
playStat17 32
playStat18 33
playStat19 34
playStat20 35
playStat21 36
playStat22 37
playStat23 38
playStat24 39
playStat25 40
playStat26 41
playStat27 42
playStat28 43
playStat29 44
playStat30 45
playStat31 46
playStat32 47
playStat33 48
playStat34 49
playStat35 50
playStat36 51
playStat37 52
playStat38 53
playStat39 54
playStat40 55
playStat41 56
playStat42 57
playStat43 58
playStat44 59
playStat45 60
playStat46 61
playStat47 62
playStat48 63
playStat49 64
playStat50 65
playStat51 66
playStat52 67
playStat53 68
playStat54 69
playStat55 70


In [329]:
X_nonan = X_small.dropna(axis=0, how='any')

In [330]:
X_cols = list( set(X_nonan.columns) - set(['teamRslt', 'opptRslt']) )
y = X_nonan['teamRslt']
X = X_nonan[X_cols]

In [331]:
print(X.shape)
print(y.shape)

(2460, 69)
(2460,)


In [332]:
X.columns

Index(['playStat15', 'playStat23', 'playStat36', 'playStat7', 'playStat4',
       'playStat19', 'playStat24', 'playStat26', 'playStat31', 'playStat5',
       'playStat38', 'playStat6', 'playStat22', 'playStat18', 'playStat43',
       'playStat42', 'playStat0', 'playStat35', 'opptDayOff', 'gmDate',
       'playStat8', 'teamLoc', 'playStat50', 'playStat45', 'playStat46',
       'playStat30', 'gmTime', 'playStat10', 'playStat49', 'playStat44',
       'teamDayOff', 'playStat17', 'playStat41', 'playStat51', 'playStat34',
       'playStat21', 'playStat2', 'opptDiv', 'teamMin', 'playStat32',
       'seasTyp', 'opptMin', 'playStat20', 'playStat53', 'opptConf',
       'playStat16', 'playStat28', 'playStat29', 'teamConf', 'playStat25',
       'playStat37', 'teamDiv', 'playStat1', 'playStat11', 'playStat9',
       'playStat33', 'playStat39', 'playStat40', 'playStat3', 'playStat27',
       'playStat48', 'opptLoc', 'playStat52', 'playStat14', 'playStat54',
       'playStat47', 'playStat12', 'playSt

In [333]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1)

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

model = GradientBoostingClassifier()
model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.60569105691056913