In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
player_data = pd.read_csv('files/players.csv')

In [5]:
game_details = pd.read_csv('files/games_details.csv')
full_games = pd.read_csv('files/games.csv')

  game_details = pd.read_csv('files/games_details.csv')


Removes all NA or None values in the columns when a player didn't play, or when a player had a DNP

In [6]:
for col in game_details.columns:
    if col in ['MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS']:
        game_details[col] = game_details.get(col).fillna(0)

In [7]:
def game_score_formula(pts, fg, fga, fta, ft, orb, drb, stl, ast, blk, pf, tov):
    """ 
    A measure of how well a player is doing in a single game.
    Created by John Hollinger to give rough measure of how well each
    singular player did in a game(40 is a great game, 10 is average)
    """
    return (pts) + (0.4 * fg) - (0.7 * fga) + (0.4 * (fta-ft)) + (0.7 * orb) + (0.3 * drb) + stl + (0.7 * ast) + (0.7 * blk) - (0.4 * pf) - tov

In [8]:
game_score_req = np.array(['PTS', 'FGM', 'FGA', 'FTA', 'FTM', 'OREB', 'DREB', 'STL', 'AST', 'BLK', 'PF', 'TO'])
score_reqs = game_details.get(game_score_req)

Get info on whether columns within game_details are integers, objects, floats, or anything else that is necessary

In game details, the columns that are necessary for data analysis are TEAM_ABBREVIATION, PLAYER_NAME, START_POSITION, MIN, FGM, FGA. FG_PCT, FG3M, FG3A, FG_PCT, FTM, FTA, FT_PCT, OREB, DREB, REB, AST, STL, BLK, TO, PF, PTS, PLUS_MINUS, and GAME_SCORE

We adjust the MIN column to actual minutes, and add the GAME_SCORE column to calculate game score

In [9]:
game_details['GAME_SCORE'] = score_reqs.get(game_score_req).apply(lambda x: game_score_formula(*x), axis = 1)
game_details.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,...,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_SCORE
0,22200477,1610612759,SAS,San Antonio,1629641,Romeo Langford,Romeo,F,,18:06,...,1.0,2.0,0.0,1.0,0.0,2.0,5.0,2.0,-2.0,-0.3
1,22200477,1610612759,SAS,San Antonio,1631110,Jeremy Sochan,Jeremy,F,,31:01,...,3.0,9.0,6.0,1.0,0.0,2.0,1.0,23.0,-14.0,25.1
2,22200477,1610612759,SAS,San Antonio,1627751,Jakob Poeltl,Jakob,C,,21:42,...,3.0,4.0,1.0,1.0,0.0,2.0,4.0,13.0,-4.0,8.8
3,22200477,1610612759,SAS,San Antonio,1630170,Devin Vassell,Devin,G,,30:20,...,9.0,9.0,5.0,3.0,0.0,2.0,1.0,10.0,-18.0,9.3
4,22200477,1610612759,SAS,San Antonio,1630200,Tre Jones,Tre,G,,27:44,...,2.0,2.0,3.0,0.0,0.0,2.0,2.0,19.0,0.0,13.3


Creates dataframe 'necessary_data' that are needed to find out who the best players in games typically are, based on sorting by each game and finding out the 5 players who had the best game score are.

In [10]:
necessary_data = game_details.drop(['TEAM_ID', 'TEAM_CITY', 'PLAYER_ID', 'FG3_PCT', 'NICKNAME', 'COMMENT'], axis = 1)
necessary_data.head()

Unnamed: 0,GAME_ID,TEAM_ABBREVIATION,PLAYER_NAME,START_POSITION,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_SCORE
0,22200477,SAS,Romeo Langford,F,18:06,1.0,1.0,1.0,0.0,0.0,...,1.0,2.0,0.0,1.0,0.0,2.0,5.0,2.0,-2.0,-0.3
1,22200477,SAS,Jeremy Sochan,F,31:01,7.0,14.0,0.5,2.0,4.0,...,3.0,9.0,6.0,1.0,0.0,2.0,1.0,23.0,-14.0,25.1
2,22200477,SAS,Jakob Poeltl,C,21:42,6.0,9.0,0.667,0.0,0.0,...,3.0,4.0,1.0,1.0,0.0,2.0,4.0,13.0,-4.0,8.8
3,22200477,SAS,Devin Vassell,G,30:20,4.0,13.0,0.308,1.0,6.0,...,9.0,9.0,5.0,3.0,0.0,2.0,1.0,10.0,-18.0,9.3
4,22200477,SAS,Tre Jones,G,27:44,7.0,12.0,0.583,1.0,3.0,...,2.0,2.0,3.0,0.0,0.0,2.0,2.0,19.0,0.0,13.3


Tried to find out the best 5 players in every single game depending on their game score. Sorted through the GAME_ID(s)(for individual games), and found out their game score and sorted based on the five highest

In [11]:
best_players_by_game_score = pd.DataFrame(columns = necessary_data.columns)


In [12]:
for game_id in full_games[full_games.get('GAME_DATE_EST').str.contains('2022|2021|2020')].get('GAME_ID'):
    best_players_by_game_score = pd.concat([best_players_by_game_score, necessary_data[necessary_data.get('GAME_ID') == game_id].sort_values('GAME_SCORE', ascending = False).head(5)])

In [13]:
necessary_data[necessary_data.get('GAME_ID') == game_id].sort_values('GAME_SCORE', ascending = False).head(5)

Unnamed: 0,GAME_ID,TEAM_ABBREVIATION,PLAYER_NAME,START_POSITION,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_SCORE
101755,21900507,LAL,LeBron James,F,38:04,11.0,21.0,0.524,1.0,4.0,...,11.0,13.0,12.0,2.0,1.0,5.0,0.0,31.0,18.0,33.1
101742,21900507,PHX,Kelly Oubre Jr.,F,36:19,9.0,14.0,0.643,3.0,5.0,...,3.0,6.0,3.0,3.0,0.0,1.0,4.0,26.0,-16.0,26.5
101756,21900507,LAL,Anthony Davis,F,37:55,9.0,16.0,0.563,1.0,3.0,...,10.0,11.0,2.0,0.0,1.0,3.0,1.0,26.0,16.0,21.2
101745,21900507,PHX,Devin Booker,G,37:14,11.0,23.0,0.478,1.0,3.0,...,2.0,2.0,7.0,1.0,0.0,6.0,3.0,32.0,-13.0,20.0
101759,21900507,LAL,Avery Bradley,G,30:04,9.0,11.0,0.818,0.0,2.0,...,4.0,5.0,1.0,3.0,0.0,1.0,2.0,18.0,23.0,17.7


From 2020-2022, who are the players that have been the most consistent(i.e. which players have had the most top 5 gamescores in their games?)

In [14]:
game_score_players = best_players_by_game_score.groupby('PLAYER_NAME').count().sort_values('GAME_ID', ascending = False).head(100)
game_score_players

Unnamed: 0_level_0,GAME_ID,TEAM_ABBREVIATION,START_POSITION,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FTM,...,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_SCORE
PLAYER_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Nikola Jokic,234,234,234,234,234,234,234,234,234,234,...,234,234,234,234,234,234,234,234,234,234
Giannis Antetokounmpo,225,225,225,225,225,225,225,225,225,225,...,225,225,225,225,225,225,225,225,225,225
Luka Doncic,210,210,210,210,210,210,210,210,210,210,...,210,210,210,210,210,210,210,210,210,210
Jayson Tatum,209,209,209,209,209,209,209,209,209,209,...,209,209,209,209,209,209,209,209,209,209
Trae Young,189,189,189,189,189,189,189,189,189,189,...,189,189,189,189,189,189,189,189,189,189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Dennis Schroder,67,67,48,67,67,67,67,67,67,67,...,67,67,67,67,67,67,67,67,67,67
Myles Turner,65,65,65,65,65,65,65,65,65,65,...,65,65,65,65,65,65,65,65,65,65
Tyrese Maxey,65,65,62,65,65,65,65,65,65,65,...,65,65,65,65,65,65,65,65,65,65
Brook Lopez,64,64,64,64,64,64,64,64,64,64,...,64,64,64,64,64,64,64,64,64,64


From 2020-2022, which players averaged the highest amount of game_score(in top 5 appearances only) when they played?


In [15]:
highest_avg_game_score = best_players_by_game_score.groupby('PLAYER_NAME').mean().sort_values(by = 'GAME_SCORE', ascending = False)

  highest_avg_game_score = best_players_by_game_score.groupby('PLAYER_NAME').mean().sort_values(by = 'GAME_SCORE', ascending = False)


For players that are at least in the top 100 in games played in which they've had a top 5 game score, find the ones that have the highest average

To do this, we make go through the highest_game_score DataFrame and locate the player's names and look through the highest_avg_game_score DataFrame, which contains data on the average game_score as well as the average stats that the players contained in the games they played. By sorting through the players that had the most games in the top 5 of game score per game, we make sure that we don't have outlying players who may have only played 1 or 2 excellent games with the rest of their games being below average(meaning they didn't make the top 5 of a game)

In [16]:
best_players = pd.DataFrame(columns = highest_avg_game_score.columns)
for player in game_score_players.index: 
    if player in highest_avg_game_score.index:
            best_players = pd.concat([best_players, pd.DataFrame(highest_avg_game_score.loc[player]).T])

best_players['GAMES'] = game_score_players.loc[best_players.index].get('GAME_ID')

In [17]:
top_10_best_players = best_players.sort_values('GAME_SCORE', ascending = False).head(10).index

Just judging from average game score and how the amount of the times that each player did in the games they played(top 5 in game score in their games), it seems that the best and most consistent players in the league are Giannis Antetokounmpo, Joel Embiid, Nikola Jokic, Damian Lillard, Luka Doncic, Anthony Davis, James Harden, Kevin Durant, Stephen Curry, and Trae Young. But is this actually true? So I tested this with graphs detailing how well these players actually played and how their charts and plays may be misleading or false

In [18]:
 best_players

Unnamed: 0,FGM,FGA,FG_PCT,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_SCORE,GAMES
Nikola Jokic,10.064103,17.410256,0.591944,1.380342,3.653846,5.012821,6.008547,2.700855,9.021368,11.722222,7.957265,1.414530,0.777778,3.457265,2.799145,26.521368,4.863248,26.307265,234
Giannis Antetokounmpo,10.791111,19.462222,0.559893,1.044444,3.711111,7.262222,10.702222,2.053333,10.195556,12.248889,5.724444,0.995556,1.182222,3.426667,3.111111,29.888889,6.608889,27.612889,225
Luka Doncic,10.385714,21.623810,0.480252,3.090476,8.628571,5.995238,8.252381,0.885714,7.923810,8.809524,8.623810,1.171429,0.533333,4.147619,2.414286,29.857143,3.290476,25.242857,210
Jayson Tatum,10.081340,20.842105,0.486148,3.478469,8.636364,5.846890,6.894737,0.918660,7.047847,7.966507,4.464115,1.162679,0.775120,2.803828,2.167464,29.488038,8.000000,23.266986,209
Trae Young,9.423280,20.365079,0.465519,3.031746,8.037037,8.179894,9.121693,0.698413,2.973545,3.671958,9.804233,0.862434,0.164021,3.968254,1.761905,30.058201,3.735450,24.496825,189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Dennis Schroder,7.820896,15.029851,0.527313,2.164179,4.820896,4.253731,4.940299,0.477612,3.358209,3.835821,5.805970,1.089552,0.238806,2.417910,2.358209,22.059701,3.820896,18.243284,67
Myles Turner,6.753846,11.153846,0.610123,2.092308,4.353846,2.923077,3.800000,1.815385,6.676923,8.492308,1.369231,0.876923,2.846154,1.400000,3.015385,18.523077,4.107692,18.263077,65
Tyrese Maxey,8.492308,15.430769,0.564569,2.630769,5.230769,3.861538,4.400000,0.353846,3.261538,3.615385,4.553846,1.015385,0.430769,1.276923,2.030769,23.476923,6.000000,19.929231,65
Brook Lopez,7.234375,12.296875,0.606578,2.031250,4.734375,2.734375,3.265625,2.156250,3.750000,5.906250,0.953125,0.703125,2.078125,0.937500,2.171875,19.234375,8.671875,17.385938,64


But what other ways can we calculate how valuable a player is, either offensively or defensively? To start, we can calculate how efficient certain players are by generating their points per possession. To do this, we have to calculate how many opsessions there are in a game, which can be theoretically calculated by using the formula of TPoss = FGAt + 0.44 * FTAt - OREBt + TO. Since FGAm, FTA, and TOs are typically how a posession ends, they are counted. For Free Throws, you can either shoot 1, 2, or up to 3 depending on the possession so 0.44 is the typical number since 3 pointers are rarer than 2s and 1s. Finally, Offensive Rebounds reset posessions and keep the possession to the same team