In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sbn

# Load dataframes
awards_players = pd.read_csv('../data/awards_players.csv')
coaches = pd.read_csv('../data/coaches.csv')
players_teams = pd.read_csv('../data/players_teams.csv')
players = pd.read_csv('../data/players.csv')
series_post = pd.read_csv('../data/series_post.csv')
teams_post = pd.read_csv('../data/teams_post.csv')
teams = pd.read_csv('../data/teams.csv')

In [60]:
# Cleaning Awards Players
awards_players.iloc[:, :-1]  # Drop the last column # Drop lgID column since all values are 'WNBA'

#Add the missing "award" value for line 30
awards_players.at[28, 'award'] = "Kim Perrot Sportsmanship Award" 
awards_players.head(10)

Unnamed: 0,playerID,award,year,lgID
0,thompti01w,All-Star Game Most Valuable Player,1,WNBA
1,leslili01w,All-Star Game Most Valuable Player,2,WNBA
2,leslili01w,All-Star Game Most Valuable Player,3,WNBA
3,teaslni01w,All-Star Game Most Valuable Player,4,WNBA
4,swoopsh01w,All-Star Game Most Valuable Player,6,WNBA
5,douglka01w,All-Star Game Most Valuable Player,7,WNBA
6,fordch01w,All-Star Game Most Valuable Player,8,WNBA
7,cashsw01w,All-Star Game Most Valuable Player,10,WNBA
8,coopemi01w,Coach of the Year,1,WNBA
9,hugheda99w,Coach of the Year,2,WNBA


In [61]:
# We can drop 'lgID', since all values are 'WNBA'
players_teams.drop('lgID', axis=1, inplace=True)

In [62]:
# We can drop 'lgID', since all values are 'WNBA'
coaches.drop('lgID', axis=1, inplace=True)

In [63]:
# We can drop 'firstseason', since all values are '0'
players.drop('firstseason', axis=1, inplace=True)
# We can drop 'lastseason', since all values are '0'
players.drop('lastseason', axis=1, inplace=True)

In [64]:
# We don't believe that the 'rebounds' attribute is relevant, so we drop it
#players_teams.drop('rebounds', axis=1, inplace=True)

In [65]:
# We can drop 'lgID', since all values are 'WNBA'
teams.drop('lgID', axis=1, inplace=True)
# We can drop 'franchID', since values are the same as 'teamID'
teams.drop('franchID', axis=1, inplace=True)
# We can drop 'divID', since all values are null
teams.drop('divID', axis=1, inplace=True)
# We can drop 'seeded', since all values are 0
teams.drop('seeded', axis=1, inplace=True)
# We don't believe that the 'name' attribute is relevant, so we drop it
teams.drop('name', axis=1, inplace=True)
# All of these attributes are always 0, se we can simply drop them
teams.drop(['tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB'], axis=1, inplace=True)
# We don't believe that the 'min' attribute is relevant, so we drop it
teams.drop('min', axis=1, inplace=True)
# We don't believe that the 'attend' attribute is relevant, so we drop it
teams.drop('attend', axis=1, inplace=True)
# We don't believe that the 'arena' attribute is relevant, so we drop it
teams.drop('arena', axis=1, inplace=True)

In [66]:
# We attribute points to each award, so we can compare them
# Defined points for each award
awards_players.loc[awards_players['award'] == 'All-Star Game Most Valuable Player', 'points'] = 10
awards_players.loc[awards_players['award'] == 'Coach of the Year', 'points'] = 5
awards_players.loc[awards_players['award'] == 'Defensive Player of the Year', 'points'] = 4
awards_players.loc[awards_players['award'] == 'Kim Perrot Sportsmanship Award', 'points'] = 0
awards_players.loc[awards_players['award'] == 'Most Improved Player', 'points'] = 2
awards_players.loc[awards_players['award'] == 'Most Valuable Player', 'points'] = 9
awards_players.loc[awards_players['award'] == 'Rookie of the Year', 'points'] = 1
awards_players.loc[awards_players['award'] == 'Sixth Woman of the Year', 'points'] = 3
awards_players.loc[awards_players['award'] == 'WNBA Finals Most Valuable Player', 'points'] = 7
awards_players.loc[awards_players['award'] == 'WNBA All-Decade Team', 'points'] = 8
awards_players.loc[awards_players['award'] == 'WNBA All Decade Team Honorable Mention', 'points'] = 6


awards_players['playerID'] = awards_players['playerID'].astype(str)
awards_players = awards_players.sort_values(by='playerID')

# If you want to reset the index after sorting
awards_players.reset_index(drop=True, inplace=True)

# Print the sorted DataFrame
awards_players.head(20)

Unnamed: 0,playerID,award,year,lgID,points
0,arcaija01w,Most Improved Player,2,WNBA,2.0
1,augusse01w,Rookie of the Year,7,WNBA,1.0
2,beviltu01w,Kim Perrot Sportsmanship Award,8,WNBA,0.0
3,birdsu01w,WNBA All-Decade Team,7,WNBA,8.0
4,blackde01w,Defensive Player of the Year,2,WNBA,4.0
5,boltoru01w,WNBA All Decade Team Honorable Mention,7,WNBA,6.0
6,bonnede01w,Sixth Woman of the Year,10,WNBA,3.0
7,campbed01w,Kim Perrot Sportsmanship Award,4,WNBA,0.0
8,cashsw01w,All-Star Game Most Valuable Player,10,WNBA,10.0
9,catchta01w,Rookie of the Year,3,WNBA,1.0


In [67]:
# Adding postseason statistics to regular season statistics for each player
# This is done to get the total statistics for the entire season (regular + postseason)
for year, lg_data in players_teams.groupby('year'):
    lg_stats = lg_data[['assists', 'PostAssists', 'fgMade', 'PostfgMade', 'ftMade', 'PostftMade', 
                        'turnovers', 'PostTurnovers', 'fgAttempted', 'PostfgAttempted', 'ftAttempted', 
                        'PostftAttempted', 'rebounds', 'PostRebounds', 'oRebounds', 'PostoRebounds', 
                        'PF', 'PostPF', 'points', 'PostPoints']].sum()
    
    lg_ast, lg_fg, lg_ft, lg_to, lg_fga, lg_fta, lg_trb, lg_orb, lg_pf, lg_pts = (
        lg_stats['assists'] + lg_stats['PostAssists'],
        lg_stats['fgMade'] + lg_stats['PostfgMade'],
        lg_stats['ftMade'] + lg_stats['PostftMade'],
        lg_stats['turnovers'] + lg_stats['PostTurnovers'],
        lg_stats['fgAttempted'] + lg_stats['PostfgAttempted'],
        lg_stats['ftAttempted'] + lg_stats['PostftAttempted'],
        lg_stats['rebounds'] + lg_stats['PostRebounds'],
        lg_stats['oRebounds'] + lg_stats['PostoRebounds'],
        lg_stats['PF'] + lg_stats['PostPF'],
        lg_stats['points'] + lg_stats['PostPoints']
    )

    # Calculate the performance of each player in each season
    factor = 2 / 3 - ((0.5 * lg_ast / lg_fg) / (2 * lg_fg / lg_ft))
    # Calculate the value of possessions
    vop = lg_pts / (lg_fga - lg_orb + lg_to + 0.44 * lg_fta)
    # Calculate the value of defensive rebounds
    drbp = (lg_trb - lg_orb) / lg_trb


    # Calculate the performance of each player in each team
    for tmID, tm_data in lg_data.groupby('tmID'):
        tm_ast = tm_data['assists'].sum() + tm_data['PostAssists'].sum()
        tm_fg = tm_data['fgMade'].sum() + tm_data['PostfgMade'].sum()

        for player, player_data in tm_data.groupby('playerID'):
            player_stats = player_data[['minutes', 'PostMinutes', 'threeMade', 'PostthreeMade', 'assists', 
                                        'PostAssists', 'fgMade', 'PostfgMade', 'ftMade', 'PostftMade', 
                                        'turnovers', 'PostTurnovers', 'fgAttempted', 'PostfgAttempted', 
                                        'ftAttempted', 'PostftAttempted', 'rebounds', 'PostRebounds', 
                                        'oRebounds', 'PostoRebounds', 'steals', 'PostSteals', 'blocks', 
                                        'PostBlocks', 'PF', 'PostPF']].sum()
            # retrieve the player's statistics
            min, _3p, ast, fg, ft, to, fga, fta, trb, orb, stl, blk, pf = (
                player_stats['minutes'] + player_stats['PostMinutes'],
                player_stats['threeMade'] + player_stats['PostthreeMade'],
                player_stats['assists'] + player_stats['PostAssists'],
                player_stats['fgMade'] + player_stats['PostfgMade'],
                player_stats['ftMade'] + player_stats['PostftMade'],
                player_stats['turnovers'] + player_stats['PostTurnovers'],
                player_stats['fgAttempted'] + player_stats['PostfgAttempted'],
                player_stats['ftAttempted'] + player_stats['PostftAttempted'],
                player_stats['rebounds'] + player_stats['PostRebounds'],
                player_stats['oRebounds'] + player_stats['PostoRebounds'],
                player_stats['steals'] + player_stats['PostSteals'],
                player_stats['blocks'] + player_stats['PostBlocks'],
                player_stats['PF'] + player_stats['PostPF']
            )

            # uper is a metric to evaluate the performance of each player based on various game stats
            uper = 1 / min * (_3p + (1.5 * ast) +
                              ((2 - factor * tm_ast / tm_fg) * fg) +
                              (0.5 * ft * (2 - 1 / 3 * tm_ast / tm_fg)) -
                              (vop * to) - (vop * drbp * (fga - fg)) -
                              (vop * 0.44 * (0.44 + (0.56 * drbp)) *
                               (fta - ft)) + (vop * (1 - drbp) * (trb - orb)) +
                              (vop * drbp * orb) + (vop * stl) +
                              (vop * drbp * blk) -
                              (pf *
                               (lg_ft / lg_pf - 0.44 * lg_fta / lg_pf * vop)))
            
            players_teams.loc[(players_teams['year'] == year) &
                              (players_teams['playerID'] == player) &
                              (players_teams['tmID'] == tmID),
                              'uper'] = uper

    lg_uper = players_teams[players_teams['year'] == year]['uper'].mean()
    players_teams.loc[
        players_teams['year'] == year,
        'performance'] = players_teams[players_teams['year'] == year]['uper'] * (
            15 / lg_uper)

players_teams['performance'].describe()

  uper = 1 / min * (_3p + (1.5 * ast) +
  uper = 1 / min * (_3p + (1.5 * ast) +
  uper = 1 / min * (_3p + (1.5 * ast) +
  uper = 1 / min * (_3p + (1.5 * ast) +


count    1874.000000
mean       15.000000
std         8.880203
min      -123.636226
25%        11.204041
50%        15.593892
75%        19.789928
max        73.331826
Name: performance, dtype: float64

In [68]:
for tmID, team_data in teams.groupby('tmID'):
    for year, current_year_data in team_data.groupby('year'):
        player_ids = players_teams[(players_teams['tmID'] == tmID) & (players_teams['year'] == year)]['playerID'].values
        valid_players = players[players['bioID'].isin(player_ids)]
        
        avg_performance = players_teams[(players_teams['tmID'] == tmID) & (players_teams['year'] < year)]['performance'].mean()
        avg_height = valid_players['height'].mean()
        
        teams.loc[(teams['tmID'] == tmID) & (teams['year'] == year), 'avg_performance'] = avg_performance
        teams.loc[(teams['tmID'] == tmID) & (teams['year'] == year), 'avg_height'] = avg_height

teams['avg_performance'].fillna(15, inplace=True)
teams.sort_values(by=['tmID', 'year'], inplace=True)

# Print the sorted DataFrame
teams.head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  teams['avg_performance'].fillna(15, inplace=True)


Unnamed: 0,year,tmID,confID,rank,playoff,firstRound,semis,finals,o_fgm,o_fga,...,lost,GP,homeW,homeL,awayW,awayL,confW,confL,avg_performance,avg_height
0,9,ATL,EA,7,N,,,,895,2258,...,30,34,1,16,3,14,2,18,15.0,73.214286
1,10,ATL,EA,2,Y,L,,,1089,2428,...,16,34,12,5,6,11,10,12,15.353569,72.538462
2,1,CHA,EA,8,N,,,,812,1903,...,24,32,5,11,3,13,5,16,15.0,71.692308
3,2,CHA,EA,4,Y,W,W,L,746,1780,...,14,32,11,5,7,9,15,6,14.29237,71.5
4,3,CHA,EA,2,Y,L,,,770,1790,...,14,32,11,5,7,9,12,9,13.7988,71.307692
5,4,CHA,EA,2,Y,L,,,787,1881,...,16,34,13,4,5,12,12,12,15.378091,71.181818
6,5,CHA,EA,5,N,,,,745,1744,...,18,34,10,7,6,11,8,12,15.780483,72.0
7,6,CHA,EA,6,N,,,,772,1913,...,28,34,5,12,1,16,4,16,14.997988,72.0625
8,7,CHA,EA,6,N,,,,864,2178,...,23,34,7,10,4,13,6,14,14.726193,72.916667
9,7,CHI,EA,7,N,,,,858,2175,...,29,34,3,14,2,15,4,16,15.0,72.4


In [69]:
# Drop irrelevant columns
teams.drop(["rank","firstRound","semis","finals","o_fgm","o_fga","o_ftm","o_fta","o_3pm","o_3pa","o_oreb","o_dreb","o_reb","o_asts","o_pf","o_stl","o_to","o_blk","o_pts","d_fgm","d_fga","d_ftm","d_fta","d_3pm","d_3pa","d_oreb","d_dreb","d_reb","d_asts","d_pf","d_stl","d_to","d_blk","d_pts","won","lost","GP","homeW","homeL","awayW","awayL","confW","confL"], axis=1, inplace=True)
teams

Unnamed: 0,year,tmID,confID,playoff,avg_performance,avg_height
0,9,ATL,EA,N,15.000000,73.214286
1,10,ATL,EA,Y,15.353569,72.538462
2,1,CHA,EA,N,15.000000,71.692308
3,2,CHA,EA,Y,14.292370,71.500000
4,3,CHA,EA,Y,13.798800,71.307692
...,...,...,...,...,...,...
137,6,WAS,EA,N,13.355131,71.000000
138,7,WAS,EA,Y,13.831622,71.833333
139,8,WAS,EA,N,14.375448,72.600000
140,9,WAS,EA,N,14.640351,71.625000


In [70]:
# Export the cleaned dataframes to CSV files
teams.to_csv("../data/clean/teams.csv", index=False)