# NBA Predictive Machine Learning Model
## Jordan Stapinski (jstapins), Calvin Lui (clui)
### Project II - Practical Data Science 67-364 Spring 2018

### Conventions on Representations
For the process of numerically encoding our statistics to be used in a machine learning model, we use the following numerical conventions (more concretely defined in cell one of code)

#### Conferences
| Conference | Numerical Representation |
|------------|--------------------------|
| Eastern    | 0                        |
| Western    | 1                        |
#### Divisions
| Division     | Numerical Representation |
|--------------|--------------------------|
| Atlantic     | 0                        |
| Southeastern | 1                        |
| Central      | 2                        |
| Northwestern | 3                        |
| Southwestern | 4                        |
| Pacific      | 5                        |
#### Locations
| Location | Numerical Representation |
|----------|--------------------------|
| Away     | 0                        |
| Home     | 1                        |
#### Results
| Result   | Numerical Representation |
|----------|--------------------------|
| Loss     | 0                        |
| Win      | 1                        |
#### Season Types
| Season Type    | Numerical Representation |
|----------------|--------------------------|
| Preseason      | 0                        |
| Regular Season | 1                        |
| Postseason     | 2                        |
#### Roles
| Role     | Numerical Representation |
|----------|--------------------------|
| Bench    | 0                        |
| Starter  | 1                        |
#### Positions
| Position       | Numerical Representation |
|----------------|--------------------------|
| Point Guard    | 0                        |
| Guard          | 1                        |
| Shooting Guard | 2                        |
| Small Forward  | 3                        |
| Forward        | 4                        |
| Power Forward  | 5                        |
| Center         | 6                        |

In [1]:
# code used to create TEAM_DOUBLED_CSV_FILE
# creates additional row to represent away team perspective
# necessary for group by operations

'''
def duplicate_row(row):
    new_row = row.to_frame().T
    for column in new_row.columns:
        if 'team' in column:
            oppt_value = new_row[column]
            team_value = new_row[column.replace('team', 'oppt')]
            new_row[column.replace('team', 'oppt')] = oppt_value
            new_row[column] = team_value
            
    # add home/away reversed row to team_stats
    team_stats.append(new_row)
    
    # keep orignal row unchanged
    return row

team_stats = team_stats.apply(duplicate_row, axis=1)
team_stats.head(10)
'''

"\ndef duplicate_row(row):\n    new_row = row.to_frame().T\n    for column in new_row.columns:\n        if 'team' in column:\n            oppt_value = new_row[column]\n            team_value = new_row[column.replace('team', 'oppt')]\n            new_row[column.replace('team', 'oppt')] = oppt_value\n            new_row[column] = team_value\n            \n    # add home/away reversed row to team_stats\n    team_stats.append(new_row)\n    \n    # keep orignal row unchanged\n    return row\n\nteam_stats = team_stats.apply(duplicate_row, axis=1)\nteam_stats.head(10)\n"

In [2]:
# Data Importing
import matplotlib.pyplot as plt
import numpy as np
import operator
import pandas as pd

%matplotlib inline

# Defining Constants
PLAYER_CSV_FILE = './nba-enhanced-stats/2017-18_playerBoxScore.csv'
TEAM_CSV_FILE = './nba-enhanced-stats/2017-18_teamBoxScore.csv'
TEAM_DOUBLED_CSV_FILE = './nba-enhanced-stats/2017-18_teamBoxScore_doubled.csv'
PREVIEW_LEN = 10

# Manual Categorical Mappings for Logical Encoding
teamConf_numeric = {"East": 0, "West": 1}
teamDiv_numeric = {"Atlantic": 0, "Southeast": 1, "Central": 2, "Northwest": 3, "Southwest": 4, "Pacific": 5}
teamLoc_numeric = {"Away": 0, "Home": 1}
teamRslt_numeric = {"Loss": 0, "Win": 1}
seasTyp_numeric = {"Pre": 0, "Regular": 1, "Post": 2}
playStat_numeric = {"Bench": 0, "Starter": 1}
playPos_numeric = {"PG": 0, "G": 1, "SG": 2, "SF": 3, "F": 4, "PF": 5, "C": 6}
# Team Name and Player Name will be One-Hot Encoded

# Possible Columns to Remove for Avoiding Multicollinearity
derived_team_player_cols = ['teamTREB%', 'teamASST%', "teamTS%", "teamEFG%", "teamOREB%", "teamDREB%", "teamTO%", "teamSTL%", "teamBLK%", "teamBLKR", "teamPPS", "teamFIC", "teamFIC40", "teamOrtg", "teamDrtg", "teamEDiff", "teamPlay%", "teamAR", "teamAST/TO", "teamSTL/TO"]
derived_oppt_player_cols = list(map(lambda x: x.replace("team", "oppt"), derived_team_player_cols))

# Referee Names to be Removed
useless_cols = ['offLNm1', 'offFNm1', 'offLNm2', 'offFNm2', 'offLNm3', 'offFNm3']

player_cols = ['gmDate', 'gmTime', 'seasTyp', 'playLNm', 'playFNm', 'teamAbbr', 'teamConf', 'teamDiv', 'teamLoc', 'teamRslt', 'teamDayOff', 'offLNm1', 'offFNm1', 'offLNm2', 'offFNm2', 'offLNm3', 'offFNm3', 'playDispNm', 'playStat', 'playMin', 'playPos', 'playHeight', 'playWeight', 'playBDate', 'playPTS', 'playAST', 'playTO', 'playSTL', 'playBLK', 'playPF', 'playFGA', 'playFGM', 'playFG%', 'play2PA', 'play2PM', 'play2P%', 'play3PA', 'play3PM', 'play3P%', 'playFTA', 'playFTM', 'playFT%', 'playORB', 'playDRB', 'playTRB', 'opptAbbr', 'opptConf', 'opptDiv', 'opptLoc', 'opptRslt', 'opptDayOff']
team_cols = ['gmDate', 'gmTime', 'seasTyp', 'offLNm1', 'offFNm1', 'offLNm2', 'offFNm2', 'offLNm3', 'offFNm3', 'teamAbbr', 'teamConf', 'teamDiv', 'teamLoc', 'teamRslt', 'teamMin', 'teamDayOff', 'teamPTS', 'teamAST', 'teamTO', 'teamSTL', 'teamBLK', 'teamPF', 'teamFGA', 'teamFGM', 'teamFG%', 'team2PA', 'team2PM', 'team2P%', 'team3PA', 'team3PM', 'team3P%', 'teamFTA', 'teamFTM', 'teamFT%', 'teamORB', 'teamDRB', 'teamTRB', 'teamPTS1', 'teamPTS2', 'teamPTS3', 'teamPTS4', 'teamPTS5', 'teamPTS6', 'teamPTS7', 'teamPTS8', 'teamTREB%', 'teamASST%', 'teamTS%', 'teamEFG%', 'teamOREB%', 'teamDREB%', 'teamTO%', 'teamSTL%', 'teamBLK%', 'teamBLKR', 'teamPPS', 'teamFIC', 'teamFIC40', 'teamOrtg', 'teamDrtg', 'teamEDiff', 'teamPlay%', 'teamAR', 'teamAST/TO', 'teamSTL/TO', 'opptAbbr', 'opptConf', 'opptDiv', 'opptLoc', 'opptRslt', 'opptMin', 'opptDayOff', 'opptPTS', 'opptAST', 'opptTO', 'opptSTL', 'opptBLK', 'opptPF', 'opptFGA', 'opptFGM', 'opptFG%', 'oppt2PA', 'oppt2PM', 'oppt2P%', 'oppt3PA', 'oppt3PM', 'oppt3P%', 'opptFTA', 'opptFTM', 'opptFT%', 'opptORB', 'opptDRB', 'opptTRB', 'opptPTS1', 'opptPTS2', 'opptPTS3', 'opptPTS4', 'opptPTS5', 'opptPTS6', 'opptPTS7', 'opptPTS8', 'opptTREB%', 'opptASST%', 'opptTS%', 'opptEFG%', 'opptOREB%', 'opptDREB%', 'opptTO%', 'opptSTL%', 'opptBLK%', 'opptBLKR', 'opptPPS', 'opptFIC', 'opptFIC40', 'opptOrtg', 'opptDrtg', 'opptEDiff', 'opptPlay%', 'opptAR', 'opptAST/TO', 'opptSTL/TO', 'poss', 'pace']

player_stats = pd.read_csv(PLAYER_CSV_FILE, sep=',', names=player_cols, encoding='latin-1', skiprows=[0])
team_stats = pd.read_csv(TEAM_DOUBLED_CSV_FILE, sep=',', names=team_cols, encoding='latin-1', skiprows=[0])

### Data Cleaning

Performing some data cleaning according to the above conventions, as well as removing dashes and colon in dates and times.

Furthermore, we get the stats for each team quickly before training the model.

In [3]:
# Data Cleaning for team_stats

# Date and Time Formatting
team_stats['gmDate'] = team_stats['gmDate'].apply(lambda x: x.replace("-", ""))
team_stats['gmTime'] = team_stats['gmTime'].apply(lambda x: x.replace(":", ""))

# Categorical Encoding
team_stats['teamConf'] = team_stats['teamConf'].apply(lambda x: teamConf_numeric[x])
team_stats['opptConf'] = team_stats['opptConf'].apply(lambda x: teamConf_numeric[x])
team_stats['teamDiv'] = team_stats['teamDiv'].apply(lambda x: teamDiv_numeric[x])
team_stats['opptDiv'] = team_stats['opptDiv'].apply(lambda x: teamDiv_numeric[x])
team_stats['teamLoc'] = team_stats['teamLoc'].apply(lambda x: teamLoc_numeric[x])
team_stats['opptLoc'] = team_stats['opptLoc'].apply(lambda x: teamLoc_numeric[x])
team_stats['teamRslt'] = team_stats['teamRslt'].apply(lambda x: teamRslt_numeric[x])
team_stats['opptRslt'] = team_stats['opptRslt'].apply(lambda x: teamRslt_numeric[x])
team_stats['seasTyp'] = team_stats['seasTyp'].apply(lambda x: seasTyp_numeric[x])

# Drop Useless Columns
team_stats = team_stats.drop(useless_cols, axis=1)

# Remove Duplicate Rows
# team_stats = team_stats.iloc[::2]

In [4]:
# Data Cleaning for player_stats

# Date and Time Formatting
player_stats['gmDate'] = player_stats['gmDate'].apply(lambda x: x.replace("-", ""))
player_stats['gmTime'] = player_stats['gmTime'].apply(lambda x: x.replace(":", ""))
player_stats['playBDate'] = player_stats['gmDate'].apply(lambda x: x.replace("-", ""))

# Categorical Encoding
player_stats['teamConf'] = player_stats['teamConf'].apply(lambda x: teamConf_numeric[x])
player_stats['opptConf'] = player_stats['opptConf'].apply(lambda x: teamConf_numeric[x])
player_stats['teamDiv'] = player_stats['teamDiv'].apply(lambda x: teamDiv_numeric[x])
player_stats['opptDiv'] = player_stats['opptDiv'].apply(lambda x: teamDiv_numeric[x])
player_stats['teamLoc'] = player_stats['teamLoc'].apply(lambda x: teamLoc_numeric[x])
player_stats['opptLoc'] = player_stats['opptLoc'].apply(lambda x: teamLoc_numeric[x])
player_stats['teamRslt'] = player_stats['teamRslt'].apply(lambda x: teamRslt_numeric[x])
player_stats['opptRslt'] = player_stats['opptRslt'].apply(lambda x: teamRslt_numeric[x])
player_stats['seasTyp'] = player_stats['seasTyp'].apply(lambda x: seasTyp_numeric[x])
player_stats['playStat'] = player_stats['playStat'].apply(lambda x: playStat_numeric[x])
player_stats['playPos'] = player_stats['playPos'].apply(lambda x: playPos_numeric[x])

# Drop Useless Columns
player_stats = player_stats.drop(useless_cols, axis=1)

### One-Hot Encoding
Algorithm-friendly data formatting for NBA Teams & Players.

In [5]:
# One-Hot Encoding for Players
one_hot_player = pd.get_dummies(player_stats['playDispNm'])
player_stats = player_stats.join(one_hot_player)

# Drop Processed Columns
player_stats = player_stats.drop(['playDispNm', 'playLNm', 'playFNm'], axis=1)

In [6]:
# Create Table for Use Later
# Team Representation based on Players

# join team_stats & player_stats
join_cols = ['gmDate', 'seasTyp', 'gmTime', 'teamAbbr', 'teamConf', 'teamDiv', 'teamLoc', 'teamRslt', 'teamDayOff', 'opptAbbr', 'opptConf', 'opptDiv', 'opptLoc', 'opptRslt', 'opptDayOff']
joined_stats = pd.merge(team_stats, player_stats, how='left', left_on=join_cols, right_on=join_cols, suffixes=('_t', '_p'))

func = {}
for column in joined_stats.columns:
    if column in team_cols:
        func[column] = 'first'
    elif column in player_cols:
        # drop player related statistics
        pass
    else:
        # check whether player is playing for home team
        func[column] = 'max'

# group by game + team
grp = joined_stats.groupby(['gmDate', 'teamAbbr']).agg(func)

grp.head(PREVIEW_LEN)

Unnamed: 0_level_0,Unnamed: 1_level_0,teamSTL%,Bradley Beal,London Perrantes,Yogi Ferrell,Amir Johnson,R.J. Hunter,teamPTS5,Arron Afflalo,Ben Moore,Lance Thomas,...,opptTREB%,T.J. Warren,team2PA,C.J. Miles,Kawhi Leonard,Derrick Favors,Karl-Anthony Towns,opptEDiff,opptTO%,Jordan Bell
gmDate,teamAbbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
20171017,BOS,10.9692,0,0,0,0,0,0,0,0,0,...,52.0833,0,56,0,0,0,0,2.9916,15.3153,0
20171017,CLE,2.9916,0,0,0,0,0,0,0,0,0,...,47.9167,0,61,0,0,0,0,-2.9916,10.8108,0
20171017,GS,4.8775,0,0,0,0,0,0,0,0,0,...,51.1905,0,50,0,0,0,0,0.9755,10.9834,1
20171017,HOU,8.7795,0,0,0,0,0,0,0,0,0,...,48.8095,0,56,0,0,0,0,-0.9755,16.0015,0
20171018,ATL,11.1027,0,0,0,0,0,0,0,0,0,...,45.0549,0,76,0,0,0,0,-6.056,14.3833,0
20171018,BKN,6.18,0,0,0,0,0,0,0,0,0,...,52.2222,0,64,0,0,0,0,7.9457,10.7626,0
20171018,BOS,12.0761,0,0,0,0,0,0,0,0,0,...,51.1364,0,63,0,0,0,0,8.0508,14.3954,0
20171018,CHA,4.0412,0,0,0,0,0,0,0,0,0,...,50.0,0,43,0,0,0,0,12.1236,8.161,0
20171018,DAL,5.0467,0,0,1,0,0,0,0,0,0,...,54.9451,0,41,0,0,0,0,6.056,11.4437,0
20171018,DEN,4.1894,0,0,0,0,0,0,0,0,0,...,48.6486,0,50,0,0,0,0,10.4735,14.5575,0


In [7]:
# One-Hot Encoding for Teams

# team_stats
team_stats['teamAbbr'] = team_stats['teamAbbr'].apply(lambda x: "team_" + x)
one_hot_team = pd.get_dummies(team_stats['teamAbbr'])
team_stats = team_stats.join(one_hot_team)

team_stats['opptAbbr'] = team_stats['opptAbbr'].apply(lambda x: "oppt_" + x)
one_hot_oppt_team = pd.get_dummies(team_stats['opptAbbr'])
team_stats = team_stats.join(one_hot_oppt_team)

# player_stats
player_stats['teamAbbr'] = player_stats['teamAbbr'].apply(lambda x: "team_" + x)
one_hot_team = pd.get_dummies(player_stats['teamAbbr'])
player_stats = player_stats.join(one_hot_team)

player_stats['opptAbbr'] = player_stats['opptAbbr'].apply(lambda x: "oppt_" + x)
one_hot_oppt_team = pd.get_dummies(player_stats['opptAbbr'])
player_stats = player_stats.join(one_hot_oppt_team)

# Drop Processed Columns
team_stats = team_stats.drop(['teamAbbr', 'opptAbbr'], axis=1)
player_stats = player_stats.drop(['teamAbbr', 'opptAbbr'], axis=1)

In [8]:
team_stats.head(PREVIEW_LEN)

Unnamed: 0,gmDate,gmTime,seasTyp,teamConf,teamDiv,teamLoc,teamRslt,teamMin,teamDayOff,teamPTS,...,oppt_OKC,oppt_ORL,oppt_PHI,oppt_PHO,oppt_POR,oppt_SA,oppt_SAC,oppt_TOR,oppt_UTA,oppt_WAS
0,20171017,800,1,0,0,0,0,241,0,99,...,0,0,0,0,0,0,0,0,0,0
1,20171017,800,1,0,2,1,1,240,0,102,...,0,0,0,0,0,0,0,0,0,0
2,20171017,1030,1,1,4,0,1,239,0,122,...,0,0,0,0,0,0,0,0,0,0
3,20171017,1030,1,1,5,1,0,241,0,121,...,0,0,0,0,0,0,0,0,0,0
4,20171018,700,1,0,1,0,0,238,0,90,...,0,0,0,0,0,0,0,0,0,0
5,20171018,700,1,0,2,1,1,239,0,102,...,0,0,0,0,0,0,0,0,0,0
6,20171018,700,1,0,0,0,0,241,0,131,...,0,0,0,0,0,0,0,0,0,0
7,20171018,700,1,0,2,1,1,240,0,140,...,0,0,0,0,0,0,0,0,0,0
8,20171018,700,1,0,1,0,0,240,0,109,...,0,1,0,0,0,0,0,0,0,0
9,20171018,700,1,0,1,1,1,240,0,116,...,0,0,0,0,0,0,0,0,0,0


In [9]:
player_stats.head(PREVIEW_LEN)

Unnamed: 0,gmDate,gmTime,seasTyp,teamConf,teamDiv,teamLoc,teamRslt,teamDayOff,playStat,playMin,...,oppt_OKC,oppt_ORL,oppt_PHI,oppt_PHO,oppt_POR,oppt_SA,oppt_SAC,oppt_TOR,oppt_UTA,oppt_WAS
0,20171017,800,1,0,0,0,0,0,1,40,...,0,0,0,0,0,0,0,0,0,0
1,20171017,800,1,0,0,0,0,0,1,39,...,0,0,0,0,0,0,0,0,0,0
2,20171017,800,1,0,0,0,0,0,1,37,...,0,0,0,0,0,0,0,0,0,0
3,20171017,800,1,0,0,0,0,0,1,32,...,0,0,0,0,0,0,0,0,0,0
4,20171017,800,1,0,0,0,0,0,1,5,...,0,0,0,0,0,0,0,0,0,0
5,20171017,800,1,0,0,0,0,0,0,35,...,0,0,0,0,0,0,0,0,0,0
6,20171017,800,1,0,0,0,0,0,0,20,...,0,0,0,0,0,0,0,0,0,0
7,20171017,800,1,0,0,0,0,0,0,19,...,0,0,0,0,0,0,0,0,0,0
8,20171017,800,1,0,0,0,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0
9,20171017,800,1,0,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0


## Baseline Predictive Model
Using a Gradient Boosting Classifier, we achieve a baseline of about 66% accuracy on game outcomes by using the teams playing as our only features. We really wish to bump this metric up as much as possible by better dissecting team and player statistics and then composing hypothetical scenarios not tested in the season.

Note this model doesn't really use any team or player statistics. Later, we will represent the team as a composition of individucal players, which should help in improving our ability to predict games.

In [39]:
# Predicting Wins only based on Teams Playing
# Can Build Another Model to Predict Points per Team
X = team_stats.drop(list(set(team_cols) - set(useless_cols) - set(['teamAbbr', 'opptAbbr'])), axis=1)
y = team_stats.teamRslt

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1)

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.6341463414634146

## Representing the Player
For representing a single player, we are going to leverage the object-oriented functionality of Python and construct a `Player` class. A player is represented with the following attributes:
- `name` (the player's name)
- `ppg` (points per game)
- `apg` (assists per game)
- `rpg` (rebounds per game)
- `spg` (steals per game)
- `bpg` (blocks per game)
- `fgapg` (field goals attempted per game)
- `fgp` (field goal percentage)
- `ftapg` (free throws attempted per game)
- `ftp` (free throw percentage)
- `gp` (games played)
- `raw_player_df` (dataframe for just this player)

and methods:
- `to_training_data`: returns a vectorized set of features for this player
- `stats_vs_team(team_abbr)`: returns players vectorized stats vs a specific team

In [11]:
class Player(object):
    def __init__(self, name, ppg, apg, rpg, spg, bpg, fgapg, fgp, ftapg, ftp, gp, raw_player_df):
        self.name = name
        self.ppg = ppg
        self.apg = apg
        self.rpg = rpg
        self.spg = spg
        self.bpg = bpg
        self.fgapg = fgapg
        self.fgp = fgp
        self.ftapg = ftapg
        self.ftp = ftp
# Debating use of below metric
#         self.tpp = tpp
        self.gp = gp
# Debating use of below metric
#         self.mpg = mpg
        self.raw_player_df = raw_player_df
        
    def stats_vs_team(self, team_abbr):
        specific_team = self.raw_player_df[self.raw_player_df['oppt_' + team_abbr] == 1]
        ppg = calculate_ppg(specific_team)
        apg = calculate_apg(specific_team)
        trb = calculate_trb(specific_team)
        spg = calculate_spg(specific_team)
        bpg = calculate_bpg(specific_team)
        fgapg = calculate_fgapg(specific_team)
        fgp = calculate_fgp(specific_team)
        ftapg = calculate_ftapg(specific_team)
        ftp = calculate_ftp(specific_team)
        gp = len(specific_team)
        
        return {'ppg':ppg, 'apg':apg, 'trb':trb, 'spg':spg, 'bpg':bpg, 'fgapg':fgapg, 'fgp':fgp, 'ftapg':ftapg, 'ftp':ftp, 'gp':gp}
        
    def to_training_data(self):
        # Note: Leaving out games played
        # Removing , self.ftapg, self.ftp boosted 2%
        return [self.ppg, self.apg, self.rpg, self.spg, self.bpg, self.fgapg, self.fgp]

# Get List of players
num_non_player_cols = 41
players_registered = player_stats.columns.tolist()[num_non_player_cols:]

# Helper functions to calculate statistics (per game, per season for certain ones)
def calculate_per_game(stat, specific_player):
    total_games = len(specific_player)
    return stat / total_games   

def calculate_ppg(specific_player):
    total_points = specific_player['playPTS'].sum()
    return calculate_per_game(total_points, specific_player)

def calculate_apg(specific_player):
    total_assists = specific_player['playAST'].sum()
    return calculate_per_game(total_assists, specific_player)

def calculate_trb(specific_player):
    total_rebounds = specific_player['playTRB'].sum()
    return calculate_per_game(total_rebounds, specific_player)

def calculate_spg(specific_player):
    total_steals = specific_player['playSTL'].sum()
    return calculate_per_game(total_steals, specific_player)

def calculate_bpg(specific_player):
    total_blocks = specific_player['playBLK'].sum()
    return calculate_per_game(total_blocks, specific_player)

# Field goal attempts per game
def calculate_fgapg(specific_player):
    total_fga = specific_player['playFGA'].sum()
    return calculate_per_game(total_fga, specific_player)

# Field goals percentage made per game
def calculate_fgp(specific_player):
    return specific_player['playFG%'].mean()

# Free throw attempts per game
def calculate_ftapg(specific_player):
    total_fta = specific_player['playFTA'].sum()
    return calculate_per_game(total_fta, specific_player)

# Free throw percentage per game
def calculate_ftp(specific_player):
    return specific_player['playFT%'].mean()

all_stats = {}
for player in players_registered:
    specific_player = player_stats[player_stats[player] == 1]
    ppg = calculate_ppg(specific_player)
    apg = calculate_apg(specific_player)
    trb = calculate_trb(specific_player)
    spg = calculate_spg(specific_player)
    bpg = calculate_bpg(specific_player)
    fgapg = calculate_fgapg(specific_player)
    fgp = calculate_fgp(specific_player)
    ftapg = calculate_ftapg(specific_player)
    ftp = calculate_ftp(specific_player)
    gp = len(specific_player)
    name = player
    all_stats[player] = Player(name, ppg, apg, trb, spg, bpg, fgapg, fgp, ftapg, ftp, gp, specific_player)
    
# Sanity Check
print('Stephen Curry 2017-18 Season Statistics:')
print("Points per Game:", "%.1f" % all_stats['Stephen Curry'].ppg)
print("Assists per Game:", "%.1f" % all_stats['Stephen Curry'].apg)
print("Rebounds per Game:", "%.1f" % all_stats['Stephen Curry'].rpg)

Stephen Curry 2017-18 Season Statistics:
Points per Game: 26.4
Assists per Game: 6.1
Rebounds per Game: 5.1


![stephen_curry](images/curry_stats.png)

In [12]:
# Team Representation
grp.head(PREVIEW_LEN)

Unnamed: 0_level_0,Unnamed: 1_level_0,teamSTL%,Bradley Beal,London Perrantes,Yogi Ferrell,Amir Johnson,R.J. Hunter,teamPTS5,Arron Afflalo,Ben Moore,Lance Thomas,...,opptTREB%,T.J. Warren,team2PA,C.J. Miles,Kawhi Leonard,Derrick Favors,Karl-Anthony Towns,opptEDiff,opptTO%,Jordan Bell
gmDate,teamAbbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
20171017,BOS,10.9692,0,0,0,0,0,0,0,0,0,...,52.0833,0,56,0,0,0,0,2.9916,15.3153,0
20171017,CLE,2.9916,0,0,0,0,0,0,0,0,0,...,47.9167,0,61,0,0,0,0,-2.9916,10.8108,0
20171017,GS,4.8775,0,0,0,0,0,0,0,0,0,...,51.1905,0,50,0,0,0,0,0.9755,10.9834,1
20171017,HOU,8.7795,0,0,0,0,0,0,0,0,0,...,48.8095,0,56,0,0,0,0,-0.9755,16.0015,0
20171018,ATL,11.1027,0,0,0,0,0,0,0,0,0,...,45.0549,0,76,0,0,0,0,-6.056,14.3833,0
20171018,BKN,6.18,0,0,0,0,0,0,0,0,0,...,52.2222,0,64,0,0,0,0,7.9457,10.7626,0
20171018,BOS,12.0761,0,0,0,0,0,0,0,0,0,...,51.1364,0,63,0,0,0,0,8.0508,14.3954,0
20171018,CHA,4.0412,0,0,0,0,0,0,0,0,0,...,50.0,0,43,0,0,0,0,12.1236,8.161,0
20171018,DAL,5.0467,0,0,1,0,0,0,0,0,0,...,54.9451,0,41,0,0,0,0,6.056,11.4437,0
20171018,DEN,4.1894,0,0,0,0,0,0,0,0,0,...,48.6486,0,50,0,0,0,0,10.4735,14.5575,0


### Get Roster From Game
Gets the designated players from the passed date and team abbreviation representing the specific game played

In [13]:
# Get the roster of the team from the team_stats table

def get_roster_from_game(gmDate, teamAbbr):
    def is_name(s):
        return (len(s.split()) > 1)
    final_roster = []
    row = grp[grp['gmDate'] == gmDate]
    row = row[row['teamAbbr'] == teamAbbr]
    for col in row:
        if (row[col][0] == 1) and (is_name(col)):
            final_roster += [col]
    return final_roster

get_roster_from_game('20171017', 'BOS')

['Terry Rozier',
 'Gordon Hayward',
 'Marcus Smart',
 'Jaylen Brown',
 'Semi Ojeleye',
 'Al Horford',
 'Kyrie Irving',
 'Shane Larkin',
 'Jayson Tatum',
 'Aron Baynes']

In [14]:
final = grp.drop(list(set(grp.columns) - set(['gmDate', 'teamAbbr'])), axis=1)
final.drop(final.columns[0:1], axis=1)
final.iloc()[0]

# Get all rosters (30)
included_set = set()
rosters = {}
counter = 0
# while counter < 30:
while len(included_set) < 30:
    team_data = final.iloc()[counter]
    abbr = team_data['teamAbbr']
    date = team_data['gmDate']
    if abbr in included_set:
        counter += 1
        continue
    rosters[abbr] = get_roster_from_game(date, abbr)
    included_set.add(abbr)
    counter += 1

rosters # Map of team abbr to its roster in the first game of the season

{'ATL': ['Luke Babbitt',
  'Josh Magette',
  'Dewayne Dedmon',
  'Dennis SchrÃ¶der',
  'Marco Belinelli',
  'John Collins',
  'Kent Bazemore',
  'Mike Muscala',
  'Malcolm Delaney',
  'DeAndreâ\x80\x99 Bembry',
  'Ersan Ilyasova',
  'Taurean Prince'],
 'BKN': ['Rondae Hollis-Jefferson',
  'DeMarre Carroll',
  'Jeremy Lin',
  'Caris LeVert',
  'Spencer Dinwiddie',
  'Timofey Mozgov',
  "D'Angelo Russell",
  'Allen Crabbe',
  'Quincy Acy',
  'Trevor Booker'],
 'BOS': ['Terry Rozier',
  'Gordon Hayward',
  'Marcus Smart',
  'Jaylen Brown',
  'Semi Ojeleye',
  'Al Horford',
  'Kyrie Irving',
  'Shane Larkin',
  'Jayson Tatum',
  'Aron Baynes'],
 'CHA': ['Kemba Walker',
  'Julyan Stone',
  'Malik Monk',
  'Dwayne Bacon',
  'Marvin Williams',
  'Frank Kaminsky',
  'Jeremy Lamb',
  'Dwight Howard',
  'Cody Zeller',
  'Treveon Graham'],
 'CHI': ['Quincy Pondexter',
  'Lauri Markkanen',
  'David Nwaba',
  'Robin Lopez',
  'Antonio Blakeney',
  'Ryan Arcidiacono',
  'Justin Holiday',
  'Paul Zip

In [15]:
# Map players to their numerical representation to get individual players on a team
numerical_rosters = {}
for team in rosters.keys():
    team_arr = []
    for player in rosters[team]:
        team_arr.append(all_stats[player].to_training_data())
    numerical_rosters[team] = team_arr
    
print(numerical_rosters)

{'SAC': [[8.716666666666667, 1.2, 4.8, 0.4166666666666667, 0.8166666666666667, 7.633333333333334, 0.4286716666666667], [11.561643835616438, 4.383561643835616, 2.7945205479452055, 0.9452054794520548, 0.273972602739726, 10.917808219178083, 0.4010342465753425], [5.396551724137931, 1.1896551724137931, 2.5517241379310347, 0.7241379310344828, 0.4482758620689655, 4.896551724137931, 0.35590172413793103], [13.4875, 1.9375, 3.825, 1.075, 0.275, 11.6625, 0.43085375], [9.955223880597014, 2.7761194029850746, 2.701492537313433, 0.9253731343283582, 0.3880597014925373, 7.641791044776119, 0.43090746268656716], [6.661764705882353, 1.088235294117647, 2.75, 0.4117647058823529, 0.19117647058823528, 6.147058823529412, 0.42133970588235287], [3.423076923076923, 0.5, 1.3076923076923077, 0.38461538461538464, 0.038461538461538464, 3.4615384615384617, 0.2744576923076923], [8.415384615384616, 1.9076923076923078, 2.3230769230769233, 0.9076923076923077, 0.4, 7.3076923076923075, 0.39068153846153847], [6.7183098591549

### Route 1: Take Average of Everything

Look to average all of the fields for each player into one array to represent the team.

In [16]:
team_player_avg = {}
for team in numerical_rosters.keys():
    team_arr = [1] * len(numerical_rosters[team][0])
    for player in numerical_rosters[team]:
        for stat in range(len(player)):
            team_arr[stat] += player[stat]
    for stat in range(len(team_arr)):
        team_arr[stat] /= len(numerical_rosters[team])
    team_player_avg[team] = team_arr

print(team_player_avg)

{'SAC': [8.81032454981881, 1.9564280317282141, 3.768769514776557, 0.853500509838471, 0.5186426656410877, 7.672248961937569, 0.5188372116256967], 'LAL': [9.299583651135025, 2.391165834378709, 4.156941145887164, 0.8767551569946384, 0.5567243187687242, 7.728686180452577, 0.5192023511765137], 'GS': [10.676759147173087, 2.7468047033616583, 3.9853762558268753, 0.8181133563282773, 0.72503222816502, 7.901903515013967, 0.5642001320066571], 'OKC': [9.53389533649696, 1.944306033639643, 4.18097803409498, 0.899835652301962, 0.5739983317203947, 7.808187812726408, 0.5038867929443032], 'DET': [10.06999754209552, 2.3099755890407416, 4.5998332853278825, 0.8548012346230955, 0.5023350002364803, 8.788083584472055, 0.49987110488886694], 'CHI': [7.649055781307486, 1.7920460745006723, 3.4103954254635216, 0.6291367081344843, 0.34891011299368047, 6.738006904008174, 0.46429392529389424], 'POR': [8.39025856399022, 1.7002088063952967, 4.00480815518657, 0.7079808472142411, 0.5259775920191919, 6.799826674406501, 0.4

In [17]:
# Duplicate for OPP
team_and_opp = {}
for team in team_player_avg.keys():
    team_and_opp["team_" + team] = team_player_avg[team]
    team_and_opp["oppt_" + team] = team_player_avg[team]
team_and_opp

# Flatten to 1D
# get list of games and dates
team_stat_arrs = []
results = []
for row in X.iterrows():
    index = row[0]
    teams = []
    number_teams = row[1].tolist()
#     print(list(X.columns), row[1].tolist())
    for i in range(len(number_teams)):
        if number_teams[i]:
            teams.append(X.columns[i])
    stat_arr = []
    for team in teams:
        stat_arr += team_and_opp[team]
    team_stat_arrs.append(stat_arr)
    result = team_stats.teamRslt[index]
    results.append(result)

# Team stats with team and opp
# print(team_stat_arrs)
# Result bool
# print(results)

X_train, X_test, y_train, y_test = train_test_split(team_stat_arrs, results, test_size=0.10, random_state=1)

model = GradientBoostingClassifier()
model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.6747967479674797

### Route 2: Leaving player stats unaveraged

In [40]:
team_players = {}
min_reg_players = 8

for team in numerical_rosters.keys():
    team_arr = []
    for player in numerical_rosters[team][:min_reg_players]:
        team_arr += player
    team_players[team] = team_arr

team_and_opp = {}
for team in team_players.keys():
    team_and_opp["team_" + team] = team_players[team]
    team_and_opp["oppt_" + team] = team_players[team]
# print(team_and_opp)

# Flatten to 1D
# get list of games and dates
team_stat_arrs = []
results = []
for row in X.iterrows():
    index = row[0]
    teams = []
    number_teams = row[1].tolist()
#     print(list(X.columns), row[1].tolist())
    for i in range(len(number_teams)):
        if number_teams[i]:
            teams.append(X.columns[i])
    stat_arr = []
    for team in teams:
        stat_arr += team_and_opp[team]
    team_stat_arrs.append(stat_arr)
    result = team_stats.teamRslt[index]
    results.append(result)

# Team stats with team and opp
# print(team_stat_arrs[0])
# Result bool
# print(results)

X_train, X_test, y_train, y_test = train_test_split(team_stat_arrs, results, test_size=0.10, random_state=1)

# Note beats baseline by 1.2%
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.6463414634146342

In [19]:
# Model Params / Features Definition