In [2]:
import pandas as pd
import numpy as np
from IPython.display import display
from functools import reduce
pd.options.display.max_columns = None


team_map = {
    'Philadelphia':'PHI', 
    'Boston':'BOS', 
    'Golden State':'GSW', 
    'Oklahoma City':'OKC', 
    'Milwaukee':'MIL', 
    'Charlotte':'CHA', 
    'Detroit':'DET', 
    'Brooklyn':'BKN', 
    'Indiana':'IND',
    'Memphis':'MEM', 
    'Miami':'MIA', 
    'Orlando':'ORL', 
    'Atlanta':'ATL', 
    'New York':'NYK', 
    'Cleveland':'CLE', 
    'Toronto':'TOR', 
    'New Orleans':'NOP', 
    'Houston':'HOU',
    'Minnesota':'MIN', 
    'San Antonio':'SAS', 
    'Utah':'UTA', 
    'Sacramento':'SAC', 
    'LA Clippers':'LAC', 
    'Denver':'DEN', 
    'Dallas':'DAL', 
    'Phoenix':'PHX', 
    'Chicago':'CHI',
    'Washington':'WAS', 
    'Portland':'POR', 
    'LA Lakers':'LAL'
}

In [14]:
"""
HTML style plug in for adding a title to the display table
"""
from jinja2 import Environment, ChoiceLoader, FileSystemLoader
from IPython.display import HTML
from pandas.io.formats.style import Styler

class TableStyler(Styler):
    env = Environment(
        loader=ChoiceLoader([
            FileSystemLoader("templates"),  # contains ours
            Styler.loader,  # the default
        ])
    )
    template = env.get_template("table.html")
    
def show(df, title=None):
    return HTML(TableStyler(df).render(
        table_title=title
    ))

In [7]:
"""
loading the data frames from csvs on s3
"""
import boto3
import io

s3 = boto3.client('s3')
bucket = 'nba-f2ff3e86-7bb1-4501-9a41-df1e7f7e8b71'

# 2018-19 NBA schedule data
obj = s3.get_object(Bucket=bucket, Key='NBA-2018-19-HISTORICAL-SCHEDULE-Table 1.csv')
sched_df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding = 'unicode_escape')

# 2018-19 NBA play by play data
obj = s3.get_object(Bucket=bucket, Key='[10-16-2018]-[06-13-2019]-combined-stats.csv')
games_df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding = 'unicode_escape')


  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# formatting game ids to achieve a proper join
games_df.game_id = games_df.game_id.apply(lambda game_id: ''.join(filter(lambda x: x.isdigit(), game_id)))
sched_df["game_id"] = sched_df["GAME ID"].apply(lambda x: "00"+str(x))

game = pd.merge(games_df, sched_df, how='inner', left_on="game_id", right_on="game_id")


In [10]:
# filtering data set to only be regular season
pbp_df = game[game.data_set=="2018-2019 Regular Season"].copy()
sch_df = sched_df[sched_df["DATASET"]=="NBA 2018-2019 Regular Season"].copy()

## Chapter 2: Watching a Game
<i>For games involving running or trapping teams, that number of possessions will be high--more than one hundred of 110 in the NBA. For walk-it-up teams, that number of possessions will be low--sometimes fewer than eighty. At the endof the season the fast teams will have more possessions than the slow teams, but both with have the same number as their cumulative opponents"</i> (Oliver p. 24)

In [11]:
# clean up the DF to get possession stats
pace_df = pbp_df[["team", "event_type", "type"]].copy()
pace_df['possession'] = pace_df.apply(lambda x: x.type if x.event_type == "rebound" else x.event_type, axis=1)
pace_df = pace_df.groupby(["team","possession"]).size().unstack()

In [12]:
# define variables
fga = pace_df["miss"]+pace_df["shot"]
oreb = pace_df["rebound offensive"]
ddreb = pace_df["rebound defensive"]
fgm = pace_df["shot"]
tov = pace_df["turnover"]
fta = pace_df["free throw"]

Equation for possessions (Oliver p. 24)
$$Possessions = FGA - \frac{OREB}{OREB + DDREB} \times (FGA-FGM) \times 1.07+TOV+0.4 \times FTA$$ 


In [15]:
# implement the possession formula 
possessions = fga - (oreb/(oreb+ddreb))*(fga-fgm)*1.07+tov+.4*fta
possessions.name = "possessions"
pace = possessions/82
pace.name = "pace"
pace = pace.sort_values(ascending=False).to_frame()
show(pace, "Pace by team 2018-19 season")

# this formula is actually incorrect because it assumes every game is 48 mins 
# and doesn't take into account overtime minutes played

Unnamed: 0_level_0,pace
team,Unnamed: 1_level_1
ATL,104.137
LAL,103.819
OKC,103.241
NOP,102.648
PHI,102.322
LAC,102.166
SAC,102.124
BKN,101.851
MIL,101.627
WAS,101.593


## Chapter 3: The Best Offenses and Defenses
A lot could be done with this chapter especially if there was more than 1 year of play by play data. I'm choosing to take the easy route here and just do some basic offensive and defensive ratings that emulate the more in depth comparisons done in chapter 3.

In [16]:
def color_negative_red(val):
    color = 'red' if val < 0 else 'black'
    return 'color: %s' % color

In [17]:
sched_copy = sch_df.copy()
sched_copy = sched_copy.rename(columns={
    "HOME TEAM": "home", "ROAD TEAM": "road", 
    "ROAD TEAM FINAL SCORE": "road_score", "HOME TEAM FINAL SCORE": "home_score" 
    }
)
scores = sched_copy[["home", "road", "road_score", "home_score"]].copy()
scores["home"] = scores.apply(lambda x: team_map[x.home], axis=1)
scores["road"] = scores.apply(lambda x: team_map[x.road], axis=1)

home_scores = scores[["home", "home_score", "road_score"]].copy().rename(columns={"home": "team", "home_score": "off_pts", "road_score": "def_pts"})
road_scores = scores[["road", "road_score", "home_score"]].copy().rename(columns={"road": "team", "road_score": "off_pts", "home_score": "def_pts"})
all_scores = pd.concat([road_scores, home_scores], sort=False).groupby(["team"]).sum()

all_scores = all_scores.merge(possessions.to_frame(), left_index=True, right_index=True)

# Calculating offensive and defensive ratings
off_rating = all_scores.off_pts / (all_scores.possessions/100)
off_rating.name = "offensive rating"
def_rating = all_scores.def_pts / (all_scores.possessions/100)
def_rating.name = "defensive rating"

team_ratings = reduce(lambda left,right: pd.merge(left, right, left_index=True, right_index=True), [pace, off_rating, def_rating])
team_ratings["net rating"] = team_ratings["offensive rating"] - team_ratings["defensive rating"]

avg_off = team_ratings["offensive rating"].mean()
avg_def = team_ratings["defensive rating"].mean()

team_ratings["off rating above leage average"] = team_ratings["offensive rating"] - avg_off
team_ratings["def rating below leage average"] = avg_def - team_ratings["defensive rating"]

# display
team_ratings.sort_values("net rating", ascending=False).round(decimals=1).style.applymap(color_negative_red)

Unnamed: 0_level_0,pace,offensive rating,defensive rating,net rating,off rating above leage average,def rating below leage average
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MIL,101.6,116.2,107.5,8.7,5.0,3.7
GSW,101.4,116.1,109.7,6.4,4.9,1.5
TOR,100.6,113.8,107.7,6.1,2.6,3.5
UTA,100.7,110.9,105.7,5.2,-0.3,5.5
HOU,97.3,117.1,112.2,4.9,5.9,-1.0
BOS,99.3,113.1,108.7,4.5,1.9,2.5
POR,100.5,114.1,109.9,4.2,2.9,1.3
DEN,98.1,112.8,108.8,4.0,1.6,2.4
IND,98.2,110.0,106.6,3.4,-1.2,4.6
OKC,103.2,110.9,107.6,3.3,-0.3,3.6


## Chapter 4: Reserve Your Playoff Tickets Now! We Won Three in a Row!
This chapter gets away from me a little bit here. I guess the point is that win streaks and final win percentage are correlated.

The first block of code does some basic calculations on the schedule data set to come up with final win/loss record by team. This is simply to have the win percentage to compare against teams respective max win streaks.

In [18]:
# team win loss records
win_loss = sch_df.copy()
win_loss["road_team"] = win_loss.apply(lambda x: team_map[x["ROAD TEAM"]], axis=1)
win_loss["home_team"] = win_loss.apply(lambda x: team_map[x["HOME TEAM"]], axis=1)
win_loss["winner"] = win_loss.apply(
    lambda x: 
        x.road_team if x["ROAD TEAM FINAL SCORE"] > x["HOME TEAM FINAL SCORE"]
        else x.home_team,
    axis=1
)

wins = win_loss.groupby("winner").size()
wins.name = "wins"
losses = 82 - wins
losses.name = "losses"

records = pd.merge(wins, losses, right_index=True, left_index=True)
records["win_pct"] = records.wins / 82
records = records.sort_values("win_pct", ascending=False)
show(records, "Team records for the 2018-19 season")

Unnamed: 0_level_0,wins,losses,win_pct
winner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MIL,60,22,0.731707
TOR,58,24,0.707317
GSW,57,25,0.695122
DEN,54,28,0.658537
HOU,53,29,0.646341
POR,53,29,0.646341
PHI,51,31,0.621951
UTA,50,32,0.609756
BOS,49,33,0.597561
OKC,49,33,0.597561


This function does too much but the gist is that given the win/loss data we can calculate the rolling average using the `cumsum` method as the total wins and the index number as the total games.

In [20]:
def get_win_streaks(team_name):
    # win streaks
    win_loss["DATE"] = pd.to_datetime(win_loss["DATE"])
    gsw = win_loss[(win_loss.road_team == team_name)|(win_loss.home_team==team_name)].sort_values("DATE").reset_index()

    # Calculating Rolling Average
    wins = (gsw.winner == team_name).astype(int).cumsum()
    wins.name = "win"
    win_pct = wins.to_frame().reset_index().apply(lambda x: x.win/(x["index"]+1), axis=1).to_frame(name="win pct")

    # calculating win streaks
    gsw_wins = (gsw.winner==team_name).astype(int).to_frame().copy()
    gsw_wins.name = "wins"
    streaks = gsw_wins.groupby((gsw_wins.winner != gsw_wins.winner.shift(1)).cumsum()).cumcount()+1
    streaks.name = "win_streaks"
    wins_combined = pd.merge(streaks, gsw_wins, left_index=True, right_index=True)
    final_streaks = wins_combined.win_streaks * wins_combined.winner
    final_streaks.name = "win streak"

    record_with_streaks = reduce(
        lambda left,right: pd.merge(left, right, left_index=True, right_index=True), 
        [gsw, win_pct, final_streaks])
    return record_with_streaks[["GAME ID", "home_team", "road_team", "winner", "win pct", "win streak"]]

# aggregate all the teams streak in to data frames
all_streaks = {team: get_win_streaks(team) for team in team_map.values()}

# visualize a teams schedule with the rolling average and the win streaks:
all_streaks["GSW"]

Unnamed: 0,GAME ID,home_team,road_team,winner,win pct,win streak
0,21800002,GSW,OKC,GSW,1.000000,1
1,21800024,UTA,GSW,GSW,1.000000,2
2,21800038,DEN,GSW,DEN,0.666667,0
3,21800047,GSW,PHX,GSW,0.750000,1
4,21800062,GSW,WAS,GSW,0.800000,2
5,21800068,NYK,GSW,GSW,0.833333,3
6,21800083,BKN,GSW,GSW,0.857143,4
7,21800091,CHI,GSW,GSW,0.875000,5
8,21800108,GSW,NOP,GSW,0.888889,6
9,21800124,GSW,MIN,GSW,0.900000,7


In [21]:
max_streaks = {team: team_schedule["win streak"].max() for team,team_schedule in all_streaks.items()}

streak_records = pd.merge(records, pd.Series(max_streaks, name="longest streak"), left_index=True, right_index=True)\
    .sort_values("longest streak", ascending=False)

show(streak_records, "Team records and longest win streaks in the 2018-19 season")

Unnamed: 0,wins,losses,win_pct,longest streak
GSW,57,25,0.695122,11
HOU,53,29,0.646341,9
SAS,48,34,0.585366,9
BOS,49,33,0.597561,8
TOR,58,24,0.707317,8
OKC,49,33,0.597561,7
BKN,42,40,0.512195,7
IND,48,34,0.585366,7
MIL,60,22,0.731707,7
UTA,50,32,0.609756,7


## Chapter 5: Teamwork
In the gray box at the end of the chapter Oliver compares the various player rating equations. 

In [22]:
"""Curating all the data needed to calculate player ratings"""
player_df = pbp_df[["player", "game_id", "event_type", "assist", "block", "points", "steal", "description", "type"]].copy()
player_df["missed_free_throw"] = player_df.apply(
    lambda x: 1 if (x["event_type"] == "free throw") & (x["points"] == 0.0) else 0,
    axis=1
)
player_df["event_type"] = player_df.apply(lambda x: x["type"] if x["event_type"] == "rebound" else x["event_type"], axis=1)


events = player_df.groupby(["player", "event_type"]).size()
missed_free_throw = player_df[player_df["missed_free_throw"]==1].groupby(["player", "missed_free_throw"]).size().to_frame(name="missed_ft").droplevel(1)
assists = player_df.groupby(["assist"]).size().sort_values(ascending=False).to_frame(name="ast")
blocks = player_df.groupby(["block"]).size().sort_values(ascending=False).to_frame(name="blk")
steals = player_df.groupby(["steal"]).size().sort_values(ascending=False).to_frame(name="stl")
points = player_df.groupby(["player"])["points"].sum().sort_values(ascending=False).to_frame(name="pts")

def_rebounds = events[events.index.get_level_values('event_type') == "rebound defensive"].sort_values(ascending=False).to_frame(name="dreb").droplevel(1)
off_rebounds = events[events.index.get_level_values('event_type') == "rebound offensive"].sort_values(ascending=False).to_frame(name="oreb").droplevel(1)
missed_fgs = events[events.index.get_level_values('event_type') == "miss"].sort_values(ascending=False).to_frame(name="missed_fg").droplevel(1)
turnovers = events[events.index.get_level_values('event_type') == "turnover"].sort_values(ascending=False).to_frame(name="tov").droplevel(1)

player_stats = team_ratings = reduce(
    lambda left,right: pd.merge(left, right, left_index=True, right_index=True),
    [points, assists, def_rebounds, off_rebounds, steals, blocks, missed_fgs, turnovers, missed_free_throw]
)

games_played = player_df.groupby(["player"])["game_id"].nunique().to_frame(name="gp")

### Manley's Credits
The starting point for player value calculations. <br>
<i>His form is the simplest form, where the weights are all equal to one</i> (Oliver p. 82)

$$VALUE = PTS + REB + AST + STL + BLK - TOV - Missed FG - Missed FT$$ 

In [23]:
ps = player_stats

manley = ps["pts"] + (ps["dreb"] + ps["oreb"]) + ps["ast"] + ps["stl"] + ps["blk"] - ps["tov"] - ps["missed_fg"] - ps["missed_ft"]
manley.sort_values(ascending=False).to_frame(name="player value")

Unnamed: 0,player value
James Harden,2580.0
Giannis Antetokounmpo,2484.0
Karl-Anthony Towns,2338.0
Nikola Jokic,2311.0
Nikola Vucevic,2245.0
Kevin Durant,2181.0
Rudy Gobert,2179.0
Andre Drummond,2156.0
Russell Westbrook,2120.0
Paul George,2111.0


<img src="images/value_weights.png">

In [24]:
weights_dict = {
    "manley": [1,1,1,1,1,1,-1,-1,-1,0],
    "hoopstat": [1,1.39,1.18,.69,1.39,1.94,-.83,0,-1.11,0],
    "steele": [1,1.25,1,1,1.25,1,-1,-.5,-1.25,-.5],
    "heeren": [1,1,1,1,1,1,-1,-1,-1,0],
    "belloti": [1,1.08,.92,.92,.92,.92,-.92,-.92,-.92,-.46],
    "claerbaut": [1,.63,.63,.63,.63,.63,-.63,-.24,-.63,0],
    "mays_magic": [1,.98,.71,.71,1.09,.87,-.71,-.55,-1.09,0],
    "schaller": [1,.9,.75,.75,1.8,1.1,-1,-.9,-1.8,-.6],
    "hollinger": [1,.79,.85,.35,1.2,.85,-.85,-.45,-1.2,-.41],
    "berri": [1,.92,3.82,1.71,2.44,.86,-1.38,-.79,-2.77,-.46],
}
columns = [
    "pts", "ast", "oreb", "dreb", "stl", "blk", "missed_fg", "missed_ft", "tov", "pf",
]
weights = pd.DataFrame.from_dict(weights_dict, orient="index", columns=columns)
weights

Unnamed: 0,pts,ast,oreb,dreb,stl,blk,missed_fg,missed_ft,tov,pf
manley,1,1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,0.0
hoopstat,1,1.39,1.18,0.69,1.39,1.94,-0.83,0.0,-1.11,0.0
steele,1,1.25,1.0,1.0,1.25,1.0,-1.0,-0.5,-1.25,-0.5
heeren,1,1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,0.0
belloti,1,1.08,0.92,0.92,0.92,0.92,-0.92,-0.92,-0.92,-0.46
claerbaut,1,0.63,0.63,0.63,0.63,0.63,-0.63,-0.24,-0.63,0.0
mays_magic,1,0.98,0.71,0.71,1.09,0.87,-0.71,-0.55,-1.09,0.0
schaller,1,0.9,0.75,0.75,1.8,1.1,-1.0,-0.9,-1.8,-0.6
hollinger,1,0.79,0.85,0.35,1.2,0.85,-0.85,-0.45,-1.2,-0.41
berri,1,0.92,3.82,1.71,2.44,0.86,-1.38,-0.79,-2.77,-0.46


In [25]:
columns = list(player_stats.columns)
ratings = {}
for row in weights.iterrows():
    ratings[row[0]] = \
        player_stats.apply(lambda x: x*row[1][x.name]).sum(axis=1)
    
player_values = reduce(lambda left,right: pd.merge(left, right, left_index=True, right_index=True), 
                       [rating_value.to_frame(name=rating_name) for rating_name, rating_value in ratings.items()])


In [26]:
player_values.head(100)

Unnamed: 0,manley,hoopstat,steele,heeren,belloti,claerbaut,mays_magic,schaller,hollinger,berri
James Harden,2580.0,3039.55,2721.75,2580.0,2692.80,2708.62,2746.74,2226.90,2317.69,2192.93
Paul George,2111.0,2407.83,2224.75,2111.0,2165.72,2162.30,2216.68,1906.45,1860.27,2303.25
Kemba Walker,1849.0,2231.17,1977.50,1849.0,1946.52,1969.91,2028.00,1633.40,1692.82,1598.31
Bradley Beal,1983.0,2380.55,2113.00,1983.0,2063.96,2059.46,2122.48,1770.25,1812.42,1895.00
Damian Lillard,2002.0,2367.45,2131.50,2002.0,2095.36,2043.60,2135.36,1763.55,1802.14,1798.80
Kevin Durant,2181.0,2469.59,2283.25,2181.0,2241.96,2146.64,2220.02,1893.15,1863.87,2004.20
Giannis Antetokounmpo,2484.0,2821.01,2632.75,2484.0,2506.72,2354.71,2420.80,2116.65,2029.65,2904.88
Stephen Curry,1806.0,2039.65,1883.50,1806.0,1869.76,1843.11,1895.76,1603.80,1609.04,1659.65
Karl-Anthony Towns,2338.0,2572.10,2396.50,2338.0,2342.80,2197.40,2241.21,1954.85,1877.81,2982.95
Blake Griffin,1778.0,2085.83,1896.25,1778.0,1847.36,1854.35,1854.67,1451.90,1514.38,1736.05


## Chapter 6: Rebounding Myths and Roles

In [27]:
game_box = pbp_df.copy()

game_box["game_id"] = game_box["game_id"].str[2:]
game_box["event_type"] = game_box.apply(lambda x: x["type"] if x["event_type"] == "rebound" else x["event_type"], axis=1)

# get 3pt stats
game_box["3pt_miss"] = game_box.apply(
    lambda x: 
        x.player if "3PT" in str(x.description) and x.event_type == "miss" else np.nan,
    axis=1
)
game_box["3pt_make"] = game_box.apply(
    lambda x: 
        x.player if "3PT" in str(x.description) and x.event_type == "shot" else np.nan,
    axis=1
)

In [28]:
per_gam_stats = game_box.groupby(["game_id", "team", "event_type"]).size().to_frame("count").reset_index()

per_game_pivot = pd.pivot_table(per_gam_stats,index=["game_id", "team"],values=["count"],
               columns=["event_type"],
               fill_value=0)

per_game_pivot.loc[:, ("count","fg_pct")] = per_game_pivot["count"]["shot"] / (per_game_pivot["count"]["shot"] + per_game_pivot["count"]["miss"])

per_game_pivot.columns = per_game_pivot.columns.droplevel()

for stat in ["assist", "block", "steal", "3pt_miss", "3pt_make"]:
    aggregate = game_box.groupby(["game_id", "team", stat]).size().to_frame("count").reset_index().groupby(["game_id", "team"]).sum()
    aggregate = aggregate.rename(columns={"count": stat})
    per_game_pivot = pd.merge(per_game_pivot, aggregate, left_index=True, right_index=True)

per_game_pivot["3pt_pct"] = per_game_pivot["3pt_make"] / (per_game_pivot["3pt_make"] + per_game_pivot["3pt_miss"])
per_game_pivot["rebound"] = per_game_pivot["rebound defensive"] + per_game_pivot["rebound offensive"]     


In [29]:
wins = win_loss[["GAME ID", "winner"]].reset_index()
wins["GAME ID"] = wins["GAME ID"].astype(str)
edge = per_game_pivot.groupby(['game_id']).diff(periods=-1).reset_index()
edge = pd.merge(wins, edge, left_on="GAME ID", right_on="game_id")
edge["win"] = edge.apply(lambda x: 1 if x["winner"] == x["team"] else 0, axis=1)
edge = edge.drop(columns=["index", "GAME ID"]).set_index(["game_id", "team", "winner"])
edge = edge[edge.ejection.notnull()]
edge = edge.apply(lambda x: np.sign(x))

In [30]:
stat_record = pd.DataFrame(columns=["name", "win", "tie", "loss"])
stat_columns = [c for c in edge.columns if c != "win"]
for column in stat_columns:
    gb = edge[[column, "win"]].groupby(column).sum()
    gb = gb.transpose().reset_index()
    gb = gb.rename(columns={
                        -1:"loss",
                        0:"tie",
                        1:"win",})
    gb["name"] = column
    stat_record = pd.concat([stat_record, gb], sort=False)
    
stat_record["win_pct"] = stat_record["win"] / (stat_record["win"] + stat_record["loss"])
stat_edge_win_pct = stat_record.sort_values("win_pct", ascending=False).reset_index()[["name", "win", "tie", "loss", "win_pct"]]

show(stat_edge_win_pct, "Table 6.1 Winning Percentage with Statistical Edge over Opponent 2018-19 season")

Unnamed: 0,name,win,tie,loss,win_pct
0,shot,395,40,118,0.769981
1,fg_pct,419,2,132,0.760436
2,3pt_pct,406,5,142,0.740876
3,rebound defensive,385,30,138,0.736138
4,3pt_make,350,44,159,0.687623
5,rebound,355,27,171,0.674905
6,assist,354,25,174,0.670455
7,sub,292,47,214,0.577075
8,free throw,302,25,226,0.57197
9,jump ball,244,106,203,0.545861


## Chapter 7: Stephen Curry's Significance

In [31]:
gsw_games = sch_df[(sch_df["ROAD TEAM"] == "Golden State") | (sch_df["HOME TEAM"] == "Golden State")]
gsw_game_ids = gsw_games["GAME ID"]
gsw_pbp = pbp_df[pbp_df["game_id"].astype(int).isin(gsw_game_ids.values)].copy()
gsw_pbp["all_players"] = gsw_pbp[["a1","a2","a3","a4","a5","h1","h2","h3","h4","h5",]].values.tolist()

In [41]:
steph_game_ids = gsw_pbp[gsw_pbp.all_players.apply(lambda x: "Stephen Curry" in x)]["game_id"].unique().astype(int)

steph_games = gsw_games[gsw_games["GAME ID"].isin(steph_game_ids)].copy()
steph_pbp = gsw_pbp[gsw_pbp["game_id"].isin(steph_game_ids)].copy()
non_steph_games = gsw_games[-gsw_games["GAME ID"].isin(steph_game_ids)].copy()
steph_pbp = gsw_pbp[-gsw_games["game_id"].isin(steph_game_ids)].copy()

def player_effect(player_games):
    player_games["game"] = 1
    player_games["winner"] = player_games.apply(
        lambda x:
            x["ROAD TEAM"] if x["ROAD TEAM FINAL SCORE"] > x["HOME TEAM FINAL SCORE"] else x["HOME TEAM"]
        , axis=1
    )
    player_games["win"] = player_games["winner"] == "Golden State"
    player_games["warriors_points"] = player_games.apply(
        lambda x:
            x["ROAD TEAM FINAL SCORE"] if x["ROAD TEAM"] == "Golden State" else x["HOME TEAM FINAL SCORE"]
        , axis=1
    )
    player_games["non_warriors_points"] = player_games.apply(
        lambda x:
            x["ROAD TEAM FINAL SCORE"] if x["ROAD TEAM"] != "Golden State" else x["HOME TEAM FINAL SCORE"]
        , axis=1
    )

    player_stats = player_games.agg({'win': 'sum', 'warriors_points': 'sum', 'non_warriors_points': 'sum', 'game': 'sum'})
    player_pbp = gsw_pbp[gsw_pbp["game_id"].astype(int).isin(steph_game_ids)].copy()

    player_pace = player_pbp[["team", "event_type", "type"]].copy()
    player_pace['possession'] = player_pace.apply(lambda x: x.type if x.event_type == "rebound" else x.event_type, axis=1)
    player_pace = player_pace.groupby(["team","possession"]).size().unstack()
    player_pace = player_pace.loc["GSW"]

    fga = player_pace["miss"]+player_pace["shot"]
    oreb = player_pace["rebound offensive"]
    ddreb = player_pace["rebound defensive"]
    fgm = player_pace["shot"]
    tov = player_pace["turnover"]
    fta = player_pace["free throw"]

    player_possessions = fga - (oreb/(oreb+ddreb))*(fga-fgm)*1.07+tov+.4*fta

    off_pts_per = player_stats["warriors_points"]/player_possessions*100
    def_pts_per = player_stats["non_warriors_points"]/player_possessions*100
    win_loss = f"{player_stats['win']}-{player_stats['game']-player_stats['win']}"
    return [win_loss, round(off_pts_per, 2), round(def_pts_per, 2)]


  after removing the cwd from sys.path.


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match

In [40]:
player_effect(steph_games)

['52-17', 117.12, 108.66]